[flang-commits] [flang] c09d3c1 - [flang] Add PowerPC MMA intrinsics - part 2

Daniel Chen via flang-commits flang-commits at lists.llvm.org
Tue Aug 15 11:33:07 PDT 2023


Author: Daniel Chen
Date: 2023-08-15T13:56:28-04:00
New Revision: c09d3c1ead632e39c19a78049f6e663b6dbebc4e

URL: https://github.com/llvm/llvm-project/commit/c09d3c1ead632e39c19a78049f6e663b6dbebc4e
DIFF: https://github.com/llvm/llvm-project/commit/c09d3c1ead632e39c19a78049f6e663b6dbebc4e.diff

LOG: [flang] Add PowerPC MMA intrinsics - part 2

Added: 
    flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
    flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
    flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90

Modified: 
    flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
    flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
    flang/module/mma.f90
    flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90

Removed: 
    


################################################################################
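
A minimal usage sketch of the intrinsics lowered by this patch (illustrative
only, not part of the change; the subroutine name demo_ger is made up, and a
pwr10 target is assumed, as in the tests below):

    subroutine demo_ger(a, b)
      use, intrinsic :: mma
      implicit none
      vector(real(4)), intent(in) :: a, b
      __vector_quad :: acc
      call mma_xxsetaccz(acc)         ! zero the accumulator
      call mma_xvf32gerpp(acc, a, b)  ! rank-1 (GER) update of the accumulator
      call mma_xxmfacc(acc)           ! move the accumulator contents back to VSRs
    end subroutine demo_ger
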
diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index e5344922c2c476..68ae78abf9dc97 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -72,6 +72,70 @@ enum class MMAOp {
   AssemblePair,
   DisassembleAcc,
   DisassemblePair,
+  Xxmfacc,
+  Xxmtacc,
+  Xxsetaccz,
+  Pmxvbf16ger2,
+  Pmxvbf16ger2nn,
+  Pmxvbf16ger2np,
+  Pmxvbf16ger2pn,
+  Pmxvbf16ger2pp,
+  Pmxvf16ger2,
+  Pmxvf16ger2nn,
+  Pmxvf16ger2np,
+  Pmxvf16ger2pn,
+  Pmxvf16ger2pp,
+
+  Pmxvf32ger,
+  Pmxvf32gernn,
+  Pmxvf32gernp,
+  Pmxvf32gerpn,
+  Pmxvf32gerpp,
+  Pmxvf64ger,
+  Pmxvf64gernn,
+  Pmxvf64gernp,
+  Pmxvf64gerpn,
+  Pmxvf64gerpp,
+
+  Pmxvi16ger2,
+  Pmxvi16ger2pp,
+  Pmxvi16ger2s,
+  Pmxvi16ger2spp,
+  Pmxvi4ger8,
+  Pmxvi4ger8pp,
+  Pmxvi8ger4,
+  Pmxvi8ger4pp,
+  Pmxvi8ger4spp,
+
+  Xvbf16ger2,
+  Xvbf16ger2nn,
+  Xvbf16ger2np,
+  Xvbf16ger2pn,
+  Xvbf16ger2pp,
+  Xvf16ger2,
+  Xvf16ger2nn,
+  Xvf16ger2np,
+  Xvf16ger2pn,
+  Xvf16ger2pp,
+  Xvf32ger,
+  Xvf32gernn,
+  Xvf32gernp,
+  Xvf32gerpn,
+  Xvf32gerpp,
+  Xvf64ger,
+  Xvf64gernn,
+  Xvf64gernp,
+  Xvf64gerpn,
+  Xvf64gerpp,
+  Xvi16ger2,
+  Xvi16ger2pp,
+  Xvi16ger2s,
+  Xvi16ger2spp,
+  Xvi4ger8,
+  Xvi4ger8pp,
+  Xvi8ger4,
+  Xvi8ger4pp,
+  Xvi8ger4spp,
 };
 
 enum class MMAHandlerOp {

diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index f47780dd8cd8c1..9ce09262edf555 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -59,6 +59,451 @@ static constexpr IntrinsicHandler ppcHandlers[]{
          &PI::genMmaIntr<MMAOp::DisassemblePair, MMAHandlerOp::SubToFunc>),
      {{{"data", asAddr}, {"pair", asValue}}},
      /*isElemental=*/true},
+    {"__ppc_mma_pmxvbf16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvbf16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvbf16ger2nn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvbf16ger2nn,
+                         MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvbf16ger2np",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvbf16ger2np,
+                         MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvbf16ger2pn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvbf16ger2pn,
+                         MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvbf16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvbf16ger2pp,
+                         MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf16ger2nn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf16ger2np",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf16ger2pn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf32ger",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf32ger, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf32gernn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf32gernn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf32gernp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf32gernp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf32gerpn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf32gerpn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf32gerpp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf32gerpp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf64ger",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf64ger, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf64gernn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf64gernn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf64gernp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf64gernp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf64gerpn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf64gerpn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvf64gerpp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvf64gerpp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi16ger2s",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi16ger2s, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi16ger2spp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi16ger2spp,
+                         MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi4ger8_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi4ger8, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi4ger8pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi4ger8pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi8ger4_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi8ger4, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi8ger4pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi8ger4pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_pmxvi8ger4spp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Pmxvi8ger4spp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr},
+       {"a", asValue},
+       {"b", asValue},
+       {"xmask", asValue},
+       {"ymask", asValue},
+       {"pmask", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvbf16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvbf16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvbf16ger2nn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvbf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvbf16ger2np",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvbf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvbf16ger2pn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvbf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvbf16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvbf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf16ger2nn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf16ger2nn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf16ger2np",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf16ger2np, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf16ger2pn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf16ger2pn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf32ger",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf32ger, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf32gernn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf32gernn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf32gernp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf32gernp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf32gerpn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf32gerpn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf32gerpp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf32gerpp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf64ger",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf64ger, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf64gernn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf64gernn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf64gernp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf64gernp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf64gerpn",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf64gerpn, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvf64gerpp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvf64gerpp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi16ger2_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi16ger2, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi16ger2pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi16ger2pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi16ger2s",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi16ger2s, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi16ger2spp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi16ger2spp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi4ger8_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi4ger8, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi4ger8pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi4ger8pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi8ger4_",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi8ger4, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi8ger4pp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi8ger4pp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xvi8ger4spp",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xvi8ger4spp, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}, {"a", asValue}, {"b", asValue}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xxmfacc",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xxmfacc, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xxmtacc",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xxmtacc, MMAHandlerOp::FirstArgIsResult>),
+     {{{"acc", asAddr}}},
+     /*isElemental=*/true},
+    {"__ppc_mma_xxsetaccz",
+     static_cast<IntrinsicLibrary::SubroutineGenerator>(
+         &PI::genMmaIntr<MMAOp::Xxsetaccz, MMAHandlerOp::SubToFunc>),
+     {{{"acc", asAddr}}},
+     /*isElemental=*/true},
     {"__ppc_mtfsf",
      static_cast<IntrinsicLibrary::SubroutineGenerator>(&PI::genMtfsf<false>),
      {{{"mask", asValue}, {"r", asValue}}},
@@ -1836,6 +2281,128 @@ const char *getMmaIrIntrName(MMAOp mmaOp) {
     return "llvm.ppc.mma.disassemble.acc";
   case MMAOp::DisassemblePair:
     return "llvm.ppc.vsx.disassemble.pair";
+  case MMAOp::Xxmfacc:
+    return "llvm.ppc.mma.xxmfacc";
+  case MMAOp::Xxmtacc:
+    return "llvm.ppc.mma.xxmtacc";
+  case MMAOp::Xxsetaccz:
+    return "llvm.ppc.mma.xxsetaccz";
+  case MMAOp::Pmxvbf16ger2:
+    return "llvm.ppc.mma.pmxvbf16ger2";
+  case MMAOp::Pmxvbf16ger2nn:
+    return "llvm.ppc.mma.pmxvbf16ger2nn";
+  case MMAOp::Pmxvbf16ger2np:
+    return "llvm.ppc.mma.pmxvbf16ger2np";
+  case MMAOp::Pmxvbf16ger2pn:
+    return "llvm.ppc.mma.pmxvbf16ger2pn";
+  case MMAOp::Pmxvbf16ger2pp:
+    return "llvm.ppc.mma.pmxvbf16ger2pp";
+  case MMAOp::Pmxvf16ger2:
+    return "llvm.ppc.mma.pmxvf16ger2";
+  case MMAOp::Pmxvf16ger2nn:
+    return "llvm.ppc.mma.pmxvf16ger2nn";
+  case MMAOp::Pmxvf16ger2np:
+    return "llvm.ppc.mma.pmxvf16ger2np";
+  case MMAOp::Pmxvf16ger2pn:
+    return "llvm.ppc.mma.pmxvf16ger2pn";
+  case MMAOp::Pmxvf16ger2pp:
+    return "llvm.ppc.mma.pmxvf16ger2pp";
+  case MMAOp::Pmxvf32ger:
+    return "llvm.ppc.mma.pmxvf32ger";
+  case MMAOp::Pmxvf32gernn:
+    return "llvm.ppc.mma.pmxvf32gernn";
+  case MMAOp::Pmxvf32gernp:
+    return "llvm.ppc.mma.pmxvf32gernp";
+  case MMAOp::Pmxvf32gerpn:
+    return "llvm.ppc.mma.pmxvf32gerpn";
+  case MMAOp::Pmxvf32gerpp:
+    return "llvm.ppc.mma.pmxvf32gerpp";
+  case MMAOp::Pmxvf64ger:
+    return "llvm.ppc.mma.pmxvf64ger";
+  case MMAOp::Pmxvf64gernn:
+    return "llvm.ppc.mma.pmxvf64gernn";
+  case MMAOp::Pmxvf64gernp:
+    return "llvm.ppc.mma.pmxvf64gernp";
+  case MMAOp::Pmxvf64gerpn:
+    return "llvm.ppc.mma.pmxvf64gerpn";
+  case MMAOp::Pmxvf64gerpp:
+    return "llvm.ppc.mma.pmxvf64gerpp";
+  case MMAOp::Pmxvi16ger2:
+    return "llvm.ppc.mma.pmxvi16ger2";
+  case MMAOp::Pmxvi16ger2pp:
+    return "llvm.ppc.mma.pmxvi16ger2pp";
+  case MMAOp::Pmxvi16ger2s:
+    return "llvm.ppc.mma.pmxvi16ger2s";
+  case MMAOp::Pmxvi16ger2spp:
+    return "llvm.ppc.mma.pmxvi16ger2spp";
+  case MMAOp::Pmxvi4ger8:
+    return "llvm.ppc.mma.pmxvi4ger8";
+  case MMAOp::Pmxvi4ger8pp:
+    return "llvm.ppc.mma.pmxvi4ger8pp";
+  case MMAOp::Pmxvi8ger4:
+    return "llvm.ppc.mma.pmxvi8ger4";
+  case MMAOp::Pmxvi8ger4pp:
+    return "llvm.ppc.mma.pmxvi8ger4pp";
+  case MMAOp::Pmxvi8ger4spp:
+    return "llvm.ppc.mma.pmxvi8ger4spp";
+  case MMAOp::Xvbf16ger2:
+    return "llvm.ppc.mma.xvbf16ger2";
+  case MMAOp::Xvbf16ger2nn:
+    return "llvm.ppc.mma.xvbf16ger2nn";
+  case MMAOp::Xvbf16ger2np:
+    return "llvm.ppc.mma.xvbf16ger2np";
+  case MMAOp::Xvbf16ger2pn:
+    return "llvm.ppc.mma.xvbf16ger2pn";
+  case MMAOp::Xvbf16ger2pp:
+    return "llvm.ppc.mma.xvbf16ger2pp";
+  case MMAOp::Xvf16ger2:
+    return "llvm.ppc.mma.xvf16ger2";
+  case MMAOp::Xvf16ger2nn:
+    return "llvm.ppc.mma.xvf16ger2nn";
+  case MMAOp::Xvf16ger2np:
+    return "llvm.ppc.mma.xvf16ger2np";
+  case MMAOp::Xvf16ger2pn:
+    return "llvm.ppc.mma.xvf16ger2pn";
+  case MMAOp::Xvf16ger2pp:
+    return "llvm.ppc.mma.xvf16ger2pp";
+  case MMAOp::Xvf32ger:
+    return "llvm.ppc.mma.xvf32ger";
+  case MMAOp::Xvf32gernn:
+    return "llvm.ppc.mma.xvf32gernn";
+  case MMAOp::Xvf32gernp:
+    return "llvm.ppc.mma.xvf32gernp";
+  case MMAOp::Xvf32gerpn:
+    return "llvm.ppc.mma.xvf32gerpn";
+  case MMAOp::Xvf32gerpp:
+    return "llvm.ppc.mma.xvf32gerpp";
+  case MMAOp::Xvf64ger:
+    return "llvm.ppc.mma.xvf64ger";
+  case MMAOp::Xvf64gernn:
+    return "llvm.ppc.mma.xvf64gernn";
+  case MMAOp::Xvf64gernp:
+    return "llvm.ppc.mma.xvf64gernp";
+  case MMAOp::Xvf64gerpn:
+    return "llvm.ppc.mma.xvf64gerpn";
+  case MMAOp::Xvf64gerpp:
+    return "llvm.ppc.mma.xvf64gerpp";
+  case MMAOp::Xvi16ger2:
+    return "llvm.ppc.mma.xvi16ger2";
+  case MMAOp::Xvi16ger2pp:
+    return "llvm.ppc.mma.xvi16ger2pp";
+  case MMAOp::Xvi16ger2s:
+    return "llvm.ppc.mma.xvi16ger2s";
+  case MMAOp::Xvi16ger2spp:
+    return "llvm.ppc.mma.xvi16ger2spp";
+  case MMAOp::Xvi4ger8:
+    return "llvm.ppc.mma.xvi4ger8";
+  case MMAOp::Xvi4ger8pp:
+    return "llvm.ppc.mma.xvi4ger8pp";
+  case MMAOp::Xvi8ger4:
+    return "llvm.ppc.mma.xvi8ger4";
+  case MMAOp::Xvi8ger4pp:
+    return "llvm.ppc.mma.xvi8ger4pp";
+  case MMAOp::Xvi8ger4spp:
+    return "llvm.ppc.mma.xvi8ger4spp";
   }
   llvm_unreachable("getMmaIrIntrName");
 }
@@ -1850,6 +2417,157 @@ mlir::FunctionType getMmaIrFuncType(mlir::MLIRContext *context, MMAOp mmaOp) {
     return genMmaDisassembleFuncType(context, mmaOp);
   case MMAOp::DisassemblePair:
     return genMmaDisassembleFuncType(context, mmaOp);
+  case MMAOp::Xxmfacc:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 0);
+  case MMAOp::Xxmtacc:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 0);
+  case MMAOp::Xxsetaccz:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 0);
+  case MMAOp::Pmxvbf16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvbf16ger2nn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvbf16ger2np:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvbf16ger2pn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvbf16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf16ger2nn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf16ger2np:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf16ger2pn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvf32ger:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf32gernn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf32gernp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf32gerpn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf32gerpp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf64ger:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 1, /*Vector*/ 1,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf64gernn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf64gernp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf64gerpn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvf64gerpp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1,
+                            /*Integer*/ 2);
+  case MMAOp::Pmxvi16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi16ger2s:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi16ger2spp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi4ger8:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi4ger8pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi8ger4:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi8ger4pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Pmxvi8ger4spp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2,
+                            /*Integer*/ 3);
+  case MMAOp::Xvbf16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvbf16ger2nn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvbf16ger2np:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvbf16ger2pn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvbf16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf16ger2nn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf16ger2np:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf16ger2pn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf32ger:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf32gernn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf32gernp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf32gerpn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf32gerpp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvf64ger:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 1, /*Vector*/ 1);
+  case MMAOp::Xvf64gernn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+  case MMAOp::Xvf64gernp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+  case MMAOp::Xvf64gerpn:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+  case MMAOp::Xvf64gerpp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 1, /*Vector*/ 1);
+  case MMAOp::Xvi16ger2:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi16ger2pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi16ger2s:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi16ger2spp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi4ger8:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi4ger8pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi8ger4:
+    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi8ger4pp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
+  case MMAOp::Xvi8ger4spp:
+    return genMmaVqFuncType(context, /*Quad*/ 1, /*Pair*/ 0, /*Vector*/ 2);
   }
   llvm_unreachable("getMmaIrFuncType");
 }

diff --git a/flang/module/mma.f90 b/flang/module/mma.f90
index f34e6c8fd1a725..d6d2eb87879bc0 100644
--- a/flang/module/mma.f90
+++ b/flang/module/mma.f90
@@ -12,6 +12,12 @@ module mma
 
   abstract interface
 
+!! ========== 1 argument subroutine interface ================================!!
+!! subroutine s(__vector_quad)
+  elemental subroutine sub_vq(acc)
+    __vector_quad, intent(inout) :: acc
+  end subroutine
+
 !! ========== 3 arguments subroutine interface ===============================!!
 !! subroutine s(__vector_pair, vector(i), vector(i))
 #define ELEM_SUB_VPVIVI(VKIND) \
@@ -44,6 +50,62 @@ elemental subroutine sub_vpvr##VKIND##vr##VKIND(pair, arg1, arg2); \
 #undef ELEM_SUB_VPVUVU
 #undef ELEM_SUB_VPVRVR
 
+!! subroutine s(__vector_quad, vector(i), vector(i))
+#define ELEM_SUB_VQVIVI(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vi##VKIND##vi##VKIND(acc, a, b); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(integer(VKIND)), intent(in) :: a, b; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, vector(u), vector(u))
+#define ELEM_SUB_VQVUVU(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND(acc, a, b); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(unsigned(VKIND)), intent(in) :: a, b; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, vector(r), vector(r))
+#define ELEM_SUB_VQVRVR(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vr##VKIND##vr##VKIND(acc, a, b); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(real(VKIND)), intent(in) :: a, b; \
+  end subroutine ;
+
+  ELEM_SUB_VQVIVI(inout,1) ELEM_SUB_VQVIVI(inout,2)
+  ELEM_SUB_VQVUVU(inout,1)
+  ELEM_SUB_VQVRVR(inout,4)
+  ELEM_SUB_VQVIVI(out,1) ELEM_SUB_VQVIVI(out,2)
+  ELEM_SUB_VQVUVU(out,1)
+  ELEM_SUB_VQVRVR(out,4)
+
+#undef ELEM_SUB_VQVRVR
+#undef ELEM_SUB_VQVUVU
+#undef ELEM_SUB_VQVIVI
+
+!! subroutine s(__vector_quad, __vector_pair, vector(u))
+#define ELEM_SUB_VQVPVU(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vpvu##VKIND(acc, a, b); \
+    __vector_quad, intent(INTENT) :: acc; \
+    __vector_pair, intent(in) :: a; \
+    vector(unsigned(VKIND)), intent(in) :: b; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, __vector_pair, vector(r))
+#define ELEM_SUB_VQVPVR(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vpvr##VKIND(acc, a, b); \
+    __vector_quad, intent(INTENT) :: acc; \
+    __vector_pair, intent(in) :: a; \
+    vector(real(VKIND)), intent(in) :: b; \
+  end subroutine ;
+
+  ELEM_SUB_VQVPVU(inout,1)
+  ELEM_SUB_VQVPVR(inout,8)
+  ELEM_SUB_VQVPVU(out,1)
+  ELEM_SUB_VQVPVR(out,8)
+
+#undef ELEM_SUB_VQVPVR
+#undef ELEM_SUB_VQVPVU
+
 !! ========== 5 arguments subroutine interface ===============================!!
 !! subroutine s(__vector_quad, vector(i), vector(i), vector(i), vector(i))
 #define ELEM_SUB_VQVIVIVIVI(VKIND) \
@@ -76,6 +138,95 @@ elemental subroutine sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND(acc, arg1,
 #undef ELEM_SUB_VQVUVUVUVU
 #undef ELEM_SUB_VQVIVIVIVI
 
+!! subroutine s(__vector_quad, vector(u), vector(u), integer, integer)
+#define ELEM_SUB_VQVUVUII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND##ii(acc, a, b, xmask, ymask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(unsigned(VKIND)), intent(in) :: a, b; \
+    integer(8), intent(in) :: xmask, ymask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, vector(r), vector(r), integer, integer)
+#define ELEM_SUB_VQVRVRII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vr##VKIND##vr##VKIND##ii(acc, a, b, xmask, ymask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(real(VKIND)), intent(in) :: a, b; \
+    integer(8), intent(in) :: xmask, ymask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+  end subroutine ;
+
+  ELEM_SUB_VQVUVUII(inout,1)
+  ELEM_SUB_VQVRVRII(inout,4)
+  ELEM_SUB_VQVUVUII(out,1)
+  ELEM_SUB_VQVRVRII(out,4)
+
+#undef ELEM_SUB_VQVRVRII
+#undef ELEM_SUB_VQVUVUII
+
+!! subroutine s(__vector_quad, __vector_pair, vector(u), integer, integer)
+#define ELEM_SUB_VQVPVUII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vpvu##VKIND##ii(acc, a, b, xmask, ymask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    __vector_pair, intent(in) :: a; \
+    vector(unsigned(VKIND)), intent(in) :: b; \
+    integer(8), intent(in) :: xmask, ymask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, __vector_pair, vector(r), integer, integer)
+#define ELEM_SUB_VQVPVRII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vpvr##VKIND##ii(acc, a, b, xmask, ymask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    __vector_pair, intent(in) :: a; \
+    vector(real(VKIND)), intent(in) :: b; \
+    integer(8), intent(in) :: xmask, ymask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+  end subroutine ;
+
+  ELEM_SUB_VQVPVUII(inout,1)
+  ELEM_SUB_VQVPVRII(inout,8)
+  ELEM_SUB_VQVPVUII(out,1)
+  ELEM_SUB_VQVPVRII(out,8)
+
+#undef ELEM_SUB_VQVPVRII
+#undef ELEM_SUB_VQVPVUII
+
+!! ========== 6 arguments subroutine interface ===============================!!
+!! subroutine s(__vector_quad, vector(i), vector(i), integer, integer, integer)
+#define ELEM_SUB_VQVIVIIII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vi##VKIND##vi##VKIND##iii(acc, a, b, xmask, ymask, pmask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(integer(VKIND)), intent(in) :: a, b; \
+    integer(8), intent(in) :: xmask, ymask, pmask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+    !dir$ ignore_tkr(k) pmask; \
+  end subroutine ;
+
+!! subroutine s(__vector_quad, vector(u), vector(u), integer, integer, integer)
+#define ELEM_SUB_VQVUVUIII(INTENT, VKIND) \
+  elemental subroutine sub_vq##INTENT##vu##VKIND##vu##VKIND##iii(acc, a, b, xmask, ymask, pmask); \
+    __vector_quad, intent(INTENT) :: acc; \
+    vector(unsigned(VKIND)), intent(in) :: a, b; \
+    integer(8), intent(in) :: xmask, ymask, pmask; \
+    !dir$ ignore_tkr(k) xmask; \
+    !dir$ ignore_tkr(k) ymask; \
+    !dir$ ignore_tkr(k) pmask; \
+  end subroutine ;
+
+  ELEM_SUB_VQVIVIIII(inout,1) ELEM_SUB_VQVIVIIII(inout,2)
+  ELEM_SUB_VQVUVUIII(inout,1)
+  ELEM_SUB_VQVIVIIII(out,1) ELEM_SUB_VQVIVIIII(out,2)
+  ELEM_SUB_VQVUVUIII(out,1)
+
+#undef ELEM_SUB_VQVUVUIII
+#undef ELEM_SUB_VQVIVIIII
+
 !! ========== non-macro interface =============================================!!
   elemental subroutine sub_atvp(data, pair)
     ! Dummy arg 'data' is supposed to be intent(out) of any type,
@@ -202,6 +353,552 @@ elemental subroutine sub_atvq(data, acc)
 #undef SUB_VP_VU_VU
 #undef SUB_VP_VI_VI
 
+#define SUB_VQ_VI_VI_I_I_I(NAME, VKIND) __ppc_##NAME##_vqvi##VKIND##vi##VKIND##i0i0i0
+#define SUB_VQ_VU_VU_I_I_I(NAME, VKIND) __ppc_##NAME##_vqvu##VKIND##vu##VKIND##i0i0i0
+
+#define VEC_SUB_VQ_VI_VI_I_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vi##VKIND##vi##VKIND##iii) :: SUB_VQ_VI_VI_I_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VU_VU_I_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND##iii) :: SUB_VQ_VU_VU_I_I_I(NAME, VKIND);
+
+! mma_pmxvbf16ger2
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2_,out,1)
+  interface mma_pmxvbf16ger2
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2_,1)
+  end interface mma_pmxvbf16ger2
+  public mma_pmxvbf16ger2
+
+! mma_pmxvbf16ger2nn
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2nn,inout,1)
+  interface mma_pmxvbf16ger2nn
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2nn,1)
+  end interface mma_pmxvbf16ger2nn
+  public mma_pmxvbf16ger2nn
+
+! mma_pmxvbf16ger2np
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2np,inout,1)
+  interface mma_pmxvbf16ger2np
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2np,1)
+  end interface mma_pmxvbf16ger2np
+  public mma_pmxvbf16ger2np
+
+! mma_pmxvbf16ger2pn
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pn,inout,1)
+  interface mma_pmxvbf16ger2pn
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pn,1)
+  end interface mma_pmxvbf16ger2pn
+  public mma_pmxvbf16ger2pn
+
+! mma_pmxvbf16ger2pp
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pp,inout,1)
+  interface mma_pmxvbf16ger2pp
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvbf16ger2pp,1)
+  end interface mma_pmxvbf16ger2pp
+  public mma_pmxvbf16ger2pp
+
+! mma_pmxvf16ger2
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2_,out,1)
+  interface mma_pmxvf16ger2
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2_,1)
+  end interface mma_pmxvf16ger2
+  public mma_pmxvf16ger2
+
+! mma_pmxvf16ger2nn
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2nn,inout,1)
+  interface mma_pmxvf16ger2nn
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2nn,1)
+  end interface mma_pmxvf16ger2nn
+  public mma_pmxvf16ger2nn
+
+! mma_pmxvf16ger2np
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2np,inout,1)
+  interface mma_pmxvf16ger2np
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2np,1)
+  end interface mma_pmxvf16ger2np
+  public mma_pmxvf16ger2np
+
+! mma_pmxvf16ger2pn
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pn,inout,1)
+  interface mma_pmxvf16ger2pn
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pn,1)
+  end interface mma_pmxvf16ger2pn
+  public mma_pmxvf16ger2pn
+
+! mma_pmxvf16ger2pp
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pp,inout,1)
+  interface mma_pmxvf16ger2pp
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvf16ger2pp,1)
+  end interface mma_pmxvf16ger2pp
+  public mma_pmxvf16ger2pp
+
+! mma_pmxvi16ger2
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2_,out,2)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2_,out,1)
+  interface mma_pmxvi16ger2
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2_,2)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2_,1)
+  end interface mma_pmxvi16ger2
+  public mma_pmxvi16ger2
+
+! mma_pmxvi16ger2pp
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2pp,inout,2)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2pp,inout,1)
+  interface mma_pmxvi16ger2pp
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2pp,2)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2pp,1)
+  end interface mma_pmxvi16ger2pp
+  public mma_pmxvi16ger2pp
+
+! mma_pmxvi16ger2s
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2s,out,2)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2s,out,1)
+  interface mma_pmxvi16ger2s
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2s,2)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2s,1)
+  end interface mma_pmxvi16ger2s
+  public mma_pmxvi16ger2s
+
+! mma_pmxvi16ger2spp
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2spp,inout,2)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2spp,inout,1)
+  interface mma_pmxvi16ger2spp
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi16ger2spp,2)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi16ger2spp,1)
+  end interface mma_pmxvi16ger2spp
+  public mma_pmxvi16ger2spp
+
+! mma_pmxvi4ger8
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8_,out,1)
+  interface mma_pmxvi4ger8
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8_,1)
+  end interface mma_pmxvi4ger8
+  public mma_pmxvi4ger8
+
+! mma_pmxvi4ger8pp
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8pp,inout,1)
+  interface mma_pmxvi4ger8pp
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi4ger8pp,1)
+  end interface mma_pmxvi4ger8pp
+  public mma_pmxvi4ger8pp
+
+! mma_pmxvi8ger4
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4_,out,1)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4_,out,1)
+  interface mma_pmxvi8ger4
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4_,1)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4_,1)
+  end interface mma_pmxvi8ger4
+  public mma_pmxvi8ger4
+
+! mma_pmxvi8ger4pp
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4pp,inout,1)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4pp,inout,1)
+  interface mma_pmxvi8ger4pp
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4pp,1)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4pp,1)
+  end interface mma_pmxvi8ger4pp
+  public mma_pmxvi8ger4pp
+
+! mma_pmxvi8ger4spp
+  VEC_SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4spp,inout,1)
+  VEC_SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4spp,inout,1)
+  interface mma_pmxvi8ger4spp
+    procedure :: SUB_VQ_VI_VI_I_I_I(mma_pmxvi8ger4spp,1)
+    procedure :: SUB_VQ_VU_VU_I_I_I(mma_pmxvi8ger4spp,1)
+  end interface mma_pmxvi8ger4spp
+  public mma_pmxvi8ger4spp
+
+#undef VEC_SUB_VQ_VU_VU_I_I_I
+#undef VEC_SUB_VQ_VI_VI_I_I_I
+#undef SUB_VQ_VU_VU_I_I_I
+#undef SUB_VQ_VI_VI_I_I_I
+
+#define SUB_VQ_VU_VU_I_I(NAME, VKIND) __ppc_##NAME##_vqvu##VKIND##vu##VKIND##i0i0
+#define SUB_VQ_VR_VR_I_I(NAME, VKIND) __ppc_##NAME##_vqvr##VKIND##vr##VKIND##i0i0
+
+#define VEC_SUB_VQ_VU_VU_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND##ii) :: SUB_VQ_VU_VU_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VR_VR_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vr##VKIND##vr##VKIND##ii) :: SUB_VQ_VR_VR_I_I(NAME, VKIND);
+
+! mma_pmxvf32ger
+  VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32ger,out,1)
+  VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32ger,out,4)
+  interface mma_pmxvf32ger
+    procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32ger,1)
+    procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32ger,4)
+  end interface mma_pmxvf32ger
+  public mma_pmxvf32ger
+
+! mma_pmxvf32gernn
+  VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gernn,inout,1)
+  VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gernn,inout,4)
+  interface mma_pmxvf32gernn
+    procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gernn,1)
+    procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gernn,4)
+  end interface mma_pmxvf32gernn
+  public mma_pmxvf32gernn
+
+! mma_pmxvf32gernp
+  VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gernp,inout,1)
+  VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gernp,inout,4)
+  interface mma_pmxvf32gernp
+    procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gernp,1)
+    procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gernp,4)
+  end interface mma_pmxvf32gernp
+  public mma_pmxvf32gernp
+
+! mma_pmxvf32gerpn
+  VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpn,inout,1)
+  VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpn,inout,4)
+  interface mma_pmxvf32gerpn
+    procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpn,1)
+    procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpn,4)
+  end interface mma_pmxvf32gerpn
+  public mma_pmxvf32gerpn
+
+! mma_pmxvf32gerpp
+  VEC_SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpp,inout,1)
+  VEC_SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpp,inout,4)
+  interface mma_pmxvf32gerpp
+    procedure :: SUB_VQ_VU_VU_I_I(mma_pmxvf32gerpp,1)
+    procedure :: SUB_VQ_VR_VR_I_I(mma_pmxvf32gerpp,4)
+  end interface mma_pmxvf32gerpp
+  public mma_pmxvf32gerpp
+
+#undef VEC_SUB_VQ_VR_VR_I_I
+#undef VEC_SUB_VQ_VU_VU_I_I
+#undef SUB_VQ_VR_VR_I_I
+#undef SUB_VQ_VU_VU_I_I
+
+#define SUB_VQ_VP_VU_I_I(NAME, VKIND) __ppc_##NAME##_vqvpvu##VKIND##i0i0
+#define SUB_VQ_VP_VR_I_I(NAME, VKIND) __ppc_##NAME##_vqvpvr##VKIND##i0i0
+
+#define VEC_SUB_VQ_VP_VU_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vpvu##VKIND##ii) :: SUB_VQ_VP_VU_I_I(NAME, VKIND);
+#define VEC_SUB_VQ_VP_VR_I_I(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vpvr##VKIND##ii) :: SUB_VQ_VP_VR_I_I(NAME, VKIND);
+
+! mma_pmxvf64ger
+  VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64ger,out,1)
+  VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64ger,out,8)
+  interface mma_pmxvf64ger
+    procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64ger,1)
+    procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64ger,8)
+  end interface mma_pmxvf64ger
+  public mma_pmxvf64ger
+
+! mma_pmxvf64gernn
+  VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gernn,inout,1)
+  VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gernn,inout,8)
+  interface mma_pmxvf64gernn
+    procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gernn,1)
+    procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gernn,8)
+  end interface mma_pmxvf64gernn
+  public mma_pmxvf64gernn
+
+! mma_pmxvf64gernp
+  VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gernp,inout,1)
+  VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gernp,inout,8)
+  interface mma_pmxvf64gernp
+    procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gernp,1)
+    procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gernp,8)
+  end interface mma_pmxvf64gernp
+  public mma_pmxvf64gernp
+
+! mma_pmxvf64gerpn
+  VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpn,inout,1)
+  VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpn,inout,8)
+  interface mma_pmxvf64gerpn
+    procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpn,1)
+    procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpn,8)
+  end interface mma_pmxvf64gerpn
+  public mma_pmxvf64gerpn
+
+! mma_pmxvf64gerpp
+  VEC_SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpp,inout,1)
+  VEC_SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpp,inout,8)
+  interface mma_pmxvf64gerpp
+    procedure :: SUB_VQ_VP_VU_I_I(mma_pmxvf64gerpp,1)
+    procedure :: SUB_VQ_VP_VR_I_I(mma_pmxvf64gerpp,8)
+  end interface mma_pmxvf64gerpp
+  public mma_pmxvf64gerpp
+
+#undef VEC_SUB_VQ_VP_VR_I_I
+#undef VEC_SUB_VQ_VP_VU_I_I
+#undef SUB_VQ_VP_VR_I_I
+#undef SUB_VQ_VP_VU_I_I
+
+#define SUB_VQ_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND
+#define SUB_VQ_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND
+#define SUB_VQ_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND
+
+#define VEC_SUB_VQ_VI_VI(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vi##VKIND##vi##VKIND) :: SUB_VQ_VI_VI(NAME, VKIND);
+#define VEC_SUB_VQ_VU_VU(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vu##VKIND##vu##VKIND) :: SUB_VQ_VU_VU(NAME, VKIND);
+#define VEC_SUB_VQ_VR_VR(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vr##VKIND##vr##VKIND) :: SUB_VQ_VR_VR(NAME, VKIND);
+
+!! First argument with INTENT(INOUT)
+! mma_xvbf16ger2nn
+  VEC_SUB_VQ_VU_VU(mma_xvbf16ger2nn,inout,1)
+  interface mma_xvbf16ger2nn
+    procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2nn,1)
+  end interface
+  public mma_xvbf16ger2nn
+
+! mma_xvbf16ger2np
+  VEC_SUB_VQ_VU_VU(mma_xvbf16ger2np,inout,1)
+  interface mma_xvbf16ger2np
+    procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2np,1)
+  end interface
+  public mma_xvbf16ger2np
+
+! mma_xvbf16ger2pn
+  VEC_SUB_VQ_VU_VU(mma_xvbf16ger2pn,inout,1)
+  interface mma_xvbf16ger2pn
+    procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2pn,1)
+  end interface
+  public mma_xvbf16ger2pn
+
+! mma_xvbf16ger2pp
+  VEC_SUB_VQ_VU_VU(mma_xvbf16ger2pp,inout,1)
+  interface mma_xvbf16ger2pp
+    procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2pp,1)
+  end interface
+  public mma_xvbf16ger2pp
+
+! mma_xvi8ger4pp
+  VEC_SUB_VQ_VI_VI(mma_xvi8ger4pp,inout,1)
+  VEC_SUB_VQ_VU_VU(mma_xvi8ger4pp,inout,1)
+  interface mma_xvi8ger4pp
+    procedure :: SUB_VQ_VI_VI(mma_xvi8ger4pp,1)
+    procedure :: SUB_VQ_VU_VU(mma_xvi8ger4pp,1)
+  end interface
+  public mma_xvi8ger4pp
+
+! mma_xvi8ger4spp
+  VEC_SUB_VQ_VI_VI(mma_xvi8ger4spp,inout,1)
+  VEC_SUB_VQ_VU_VU(mma_xvi8ger4spp,inout,1)
+  interface mma_xvi8ger4spp
+    procedure :: SUB_VQ_VI_VI(mma_xvi8ger4spp,1)
+    procedure :: SUB_VQ_VU_VU(mma_xvi8ger4spp,1)
+  end interface
+  public mma_xvi8ger4spp
+
+! mma_xvi16ger2pp
+  VEC_SUB_VQ_VI_VI(mma_xvi16ger2pp,inout,2)
+  VEC_SUB_VQ_VU_VU(mma_xvi16ger2pp,inout,1)
+  interface mma_xvi16ger2pp
+    procedure :: SUB_VQ_VI_VI(mma_xvi16ger2pp,2)
+    procedure :: SUB_VQ_VU_VU(mma_xvi16ger2pp,1)
+  end interface
+  public mma_xvi16ger2pp
+
+! mma_xvi16ger2s
+  VEC_SUB_VQ_VI_VI(mma_xvi16ger2s,inout,2)
+  VEC_SUB_VQ_VU_VU(mma_xvi16ger2s,inout,1)
+  interface mma_xvi16ger2s
+    procedure :: SUB_VQ_VI_VI(mma_xvi16ger2s,2)
+    procedure :: SUB_VQ_VU_VU(mma_xvi16ger2s,1)
+  end interface
+  public mma_xvi16ger2s
+
+! mma_xvi16ger2spp
+  VEC_SUB_VQ_VI_VI(mma_xvi16ger2spp,inout,2)
+  VEC_SUB_VQ_VU_VU(mma_xvi16ger2spp,inout,1)
+  interface mma_xvi16ger2spp
+    procedure :: SUB_VQ_VI_VI(mma_xvi16ger2spp,2)
+    procedure :: SUB_VQ_VU_VU(mma_xvi16ger2spp,1)
+  end interface
+  public mma_xvi16ger2spp
+
+! mma_xvi4ger8pp
+  VEC_SUB_VQ_VU_VU(mma_xvi4ger8pp,inout,1)
+  interface mma_xvi4ger8pp
+    procedure :: SUB_VQ_VU_VU(mma_xvi4ger8pp,1)
+  end interface
+  public mma_xvi4ger8pp
+
+! mma_xvf16ger2nn
+  VEC_SUB_VQ_VU_VU(mma_xvf16ger2nn,inout,1)
+  interface mma_xvf16ger2nn
+    procedure :: SUB_VQ_VU_VU(mma_xvf16ger2nn,1)
+  end interface
+  public mma_xvf16ger2nn
+
+! mma_xvf16ger2np
+  VEC_SUB_VQ_VU_VU(mma_xvf16ger2np,inout,1)
+  interface mma_xvf16ger2np
+    procedure :: SUB_VQ_VU_VU(mma_xvf16ger2np,1)
+  end interface
+  public mma_xvf16ger2np
+
+! mma_xvf16ger2pn
+  VEC_SUB_VQ_VU_VU(mma_xvf16ger2pn,inout,1)
+  interface mma_xvf16ger2pn
+    procedure :: SUB_VQ_VU_VU(mma_xvf16ger2pn,1)
+  end interface
+  public mma_xvf16ger2pn
+
+! mma_xvf16ger2pp
+  VEC_SUB_VQ_VU_VU(mma_xvf16ger2pp,inout,1)
+  interface mma_xvf16ger2pp
+    procedure :: SUB_VQ_VU_VU(mma_xvf16ger2pp,1)
+  end interface
+  public mma_xvf16ger2pp
+
+! mma_xvf32gernn
+  VEC_SUB_VQ_VU_VU(mma_xvf32gernn,inout,1)
+  VEC_SUB_VQ_VR_VR(mma_xvf32gernn,inout,4)
+  interface mma_xvf32gernn
+    procedure :: SUB_VQ_VU_VU(mma_xvf32gernn,1)
+    procedure :: SUB_VQ_VR_VR(mma_xvf32gernn,4)
+  end interface
+  public mma_xvf32gernn
+
+! mma_xvf32gernp
+  VEC_SUB_VQ_VU_VU(mma_xvf32gernp,inout,1)
+  VEC_SUB_VQ_VR_VR(mma_xvf32gernp,inout,4)
+  interface mma_xvf32gernp
+    procedure :: SUB_VQ_VU_VU(mma_xvf32gernp,1)
+    procedure :: SUB_VQ_VR_VR(mma_xvf32gernp,4)
+  end interface
+  public mma_xvf32gernp
+
+! mma_xvf32gerpn
+  VEC_SUB_VQ_VU_VU(mma_xvf32gerpn,inout,1)
+  VEC_SUB_VQ_VR_VR(mma_xvf32gerpn,inout,4)
+  interface mma_xvf32gerpn
+    procedure :: SUB_VQ_VU_VU(mma_xvf32gerpn,1)
+    procedure :: SUB_VQ_VR_VR(mma_xvf32gerpn,4)
+  end interface
+  public mma_xvf32gerpn
+
+! mma_xvf32gerpp
+  VEC_SUB_VQ_VU_VU(mma_xvf32gerpp,inout,1)
+  VEC_SUB_VQ_VR_VR(mma_xvf32gerpp,inout,4)
+  interface mma_xvf32gerpp
+    procedure :: SUB_VQ_VU_VU(mma_xvf32gerpp,1)
+    procedure :: SUB_VQ_VR_VR(mma_xvf32gerpp,4)
+  end interface
+  public mma_xvf32gerpp
+
+!! First argument with INTENT(OUT)
+! mma_xvbf16ger2
+  VEC_SUB_VQ_VU_VU(mma_xvbf16ger2_,out,1)
+  interface mma_xvbf16ger2
+    procedure :: SUB_VQ_VU_VU(mma_xvbf16ger2_,1)
+  end interface
+  public mma_xvbf16ger2
+
+! mma_xvi16ger2
+  VEC_SUB_VQ_VI_VI(mma_xvi16ger2_,out,2)
+  VEC_SUB_VQ_VU_VU(mma_xvi16ger2_,out,1)
+  interface mma_xvi16ger2
+    procedure :: SUB_VQ_VI_VI(mma_xvi16ger2_,2)
+    procedure :: SUB_VQ_VU_VU(mma_xvi16ger2_,1)
+  end interface
+  public mma_xvi16ger2
+
+! mma_xvi4ger8
+  VEC_SUB_VQ_VU_VU(mma_xvi4ger8_,out,1)
+  interface mma_xvi4ger8
+    procedure :: SUB_VQ_VU_VU(mma_xvi4ger8_,1)
+  end interface
+  public mma_xvi4ger8
+
+! mma_xvi8ger4
+  VEC_SUB_VQ_VI_VI(mma_xvi8ger4_,out,1)
+  VEC_SUB_VQ_VU_VU(mma_xvi8ger4_,out,1)
+  interface mma_xvi8ger4
+    procedure :: SUB_VQ_VI_VI(mma_xvi8ger4_,1)
+    procedure :: SUB_VQ_VU_VU(mma_xvi8ger4_,1)
+  end interface
+  public mma_xvi8ger4
+
+! mma_xvf16ger2
+  VEC_SUB_VQ_VU_VU(mma_xvf16ger2_,out,1)
+  interface mma_xvf16ger2
+    procedure :: SUB_VQ_VU_VU(mma_xvf16ger2_,1)
+  end interface
+  public mma_xvf16ger2
+
+! mma_xvf32ger
+  VEC_SUB_VQ_VU_VU(mma_xvf32ger,out,1)
+  VEC_SUB_VQ_VR_VR(mma_xvf32ger,out,4)
+  interface mma_xvf32ger
+    procedure :: SUB_VQ_VU_VU(mma_xvf32ger,1)
+    procedure :: SUB_VQ_VR_VR(mma_xvf32ger,4)
+  end interface
+  public mma_xvf32ger
+
+#undef VEC_SUB_VQ_VR_VR
+#undef VEC_SUB_VQ_VU_VU
+#undef VEC_SUB_VQ_VI_VI
+#undef SUB_VQ_VR_VR
+#undef SUB_VQ_VU_VU
+#undef SUB_VQ_VI_VI
+
+#define SUB_VQ_VP_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND
+#define SUB_VQ_VP_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND
+
+#define VEC_SUB_VQ_VP_VU(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vpvu##VKIND) :: SUB_VQ_VP_VU(NAME, VKIND);
+#define VEC_SUB_VQ_VP_VR(NAME, INTENT, VKIND) \
+  procedure(sub_vq##INTENT##vpvr##VKIND) :: SUB_VQ_VP_VR(NAME, VKIND);
+
+! mma_xvf64ger
+  VEC_SUB_VQ_VP_VU(mma_xvf64ger,out,1)
+  VEC_SUB_VQ_VP_VR(mma_xvf64ger,out,8)
+  interface mma_xvf64ger
+    procedure :: SUB_VQ_VP_VU(mma_xvf64ger,1)
+    procedure :: SUB_VQ_VP_VR(mma_xvf64ger,8)
+  end interface
+  public mma_xvf64ger
+
+! mma_xvf64gernn
+  VEC_SUB_VQ_VP_VU(mma_xvf64gernn,inout,1)
+  VEC_SUB_VQ_VP_VR(mma_xvf64gernn,inout,8)
+  interface mma_xvf64gernn
+    procedure :: SUB_VQ_VP_VU(mma_xvf64gernn,1)
+    procedure :: SUB_VQ_VP_VR(mma_xvf64gernn,8)
+  end interface
+  public mma_xvf64gernn
+
+! mma_xvf64gernp
+  VEC_SUB_VQ_VP_VU(mma_xvf64gernp,inout,1)
+  VEC_SUB_VQ_VP_VR(mma_xvf64gernp,inout,8)
+  interface mma_xvf64gernp
+    procedure :: SUB_VQ_VP_VU(mma_xvf64gernp,1)
+    procedure :: SUB_VQ_VP_VR(mma_xvf64gernp,8)
+  end interface
+  public mma_xvf64gernp
+
+! mma_xvf64gerpn
+  VEC_SUB_VQ_VP_VU(mma_xvf64gerpn,inout,1)
+  VEC_SUB_VQ_VP_VR(mma_xvf64gerpn,inout,8)
+  interface mma_xvf64gerpn
+    procedure :: SUB_VQ_VP_VU(mma_xvf64gerpn,1)
+    procedure :: SUB_VQ_VP_VR(mma_xvf64gerpn,8)
+  end interface
+  public mma_xvf64gerpn
+
+! mma_xvf64gerpp
+  VEC_SUB_VQ_VP_VU(mma_xvf64gerpp,inout,1)
+  VEC_SUB_VQ_VP_VR(mma_xvf64gerpp,inout,8)
+  interface mma_xvf64gerpp
+    procedure :: SUB_VQ_VP_VU(mma_xvf64gerpp,1)
+    procedure :: SUB_VQ_VP_VR(mma_xvf64gerpp,8)
+  end interface
+  public mma_xvf64gerpp
+
+#undef VEC_SUB_VQ_VP_VR
+#undef VEC_SUB_VQ_VP_VU
+#undef SUB_VQ_VP_VR
+#undef SUB_VQ_VP_VU
+
 ! mma_disassemble_acc
   procedure(sub_atvq) :: __ppc_mma_disassemble_acc
   interface mma_disassemble_acc
@@ -216,5 +913,25 @@ elemental subroutine sub_atvq(data, acc)
   end interface
   public mma_disassemble_pair
 
-end module
+! mma_xxmfacc
+  procedure(sub_vq) :: __ppc_mma_xxmfacc
+  interface mma_xxmfacc
+    procedure :: __ppc_mma_xxmfacc
+  end interface
+  public mma_xxmfacc
 
+! mma_xxmtacc
+  procedure(sub_vq) :: __ppc_mma_xxmtacc
+  interface mma_xxmtacc
+    procedure :: __ppc_mma_xxmtacc
+  end interface
+  public mma_xxmtacc
+
+! mma_xxsetaccz
+  procedure(sub_vq) :: __ppc_mma_xxsetaccz
+  interface mma_xxsetaccz
+    procedure :: __ppc_mma_xxsetaccz
+  end interface
+  public mma_xxsetaccz
+
+end module

diff --git a/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90 b/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
new file mode 100644
index 00000000000000..cc9689b70343c6
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-accumulator-move-clear.f90
@@ -0,0 +1,40 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+      subroutine test_xxmfacc()
+      use, intrinsic :: mma
+      implicit none
+      __vector_quad :: cq
+      call mma_xxmfacc(cq)
+      end subroutine test_xxmfacc
+
+!CHECK-LABEL: @test_xxmfacc_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %3 = call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> %2)
+!CHECK:  store <512 x i1> %3, ptr %1, align 64
+
+      subroutine test_xxmtacc()
+      use, intrinsic :: mma
+      implicit none
+      __vector_quad :: cq
+      call mma_xxmtacc(cq)
+      end subroutine test_xxmtacc
+
+!CHECK-LABEL: @test_xxmtacc_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %3 = call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> %2)
+!CHECK:  store <512 x i1> %3, ptr %1, align 64
+
+      subroutine test_xxsetaccz()
+      use, intrinsic :: mma
+      implicit none
+      __vector_quad :: cq
+      call mma_xxsetaccz(cq)
+      end subroutine test_xxsetaccz
+
+!CHECK-LABEL: @test_xxsetaccz_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
+!CHECK:  store <512 x i1> %2, ptr %1, align 64

diff --git a/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90 b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
index 673ec4b846354f..1ae6c5305345f3 100644
--- a/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
+++ b/flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
@@ -1,4 +1,4 @@
-! RUN: %flang --target=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -emit-llvm -S %s -o - | FileCheck --check-prefixes="CHECK" %s
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
 ! REQUIRES: target=powerpc{{.*}}
 
 ! mma_assemble_acc

diff --git a/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90 b/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
new file mode 100644
index 00000000000000..96c7d65a1817ab
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-outer-product-1.f90
@@ -0,0 +1,1701 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+      subroutine test_pmxvbf16ger2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvbf16ger2_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+
+      subroutine test_pmxvbf16ger2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvbf16ger2_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+
+      subroutine test_pmxvbf16ger2nn_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2nn(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvbf16ger2nn_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2nn_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2nn_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2nn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvbf16ger2nn_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2nn_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2np_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2np(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvbf16ger2np_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2np_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2np_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2np(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvbf16ger2np_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2np_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2pn_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2pn(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvbf16ger2pn_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pn_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2pn_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2pn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvbf16ger2pn_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pn_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2pp_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2pp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvbf16ger2pp_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pp_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvbf16ger2pp_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvbf16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvbf16ger2pp_non_def
+
+!CHECK-LABEL: @test_pmxvbf16ger2pp_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvf16ger2_def
+
+!CHECK-LABEL: @test_pmxvf16ger2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvf16ger2_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2nn_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2nn(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvf16ger2nn_def
+
+!CHECK-LABEL: @test_pmxvf16ger2nn_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2nn_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2nn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvf16ger2nn_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2nn_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2np_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2np(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvf16ger2np_def
+
+!CHECK-LABEL: @test_pmxvf16ger2np_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2np_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2np(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvf16ger2np_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2np_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2pn_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2pn(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvf16ger2pn_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pn_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2pn_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2pn(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvf16ger2pn_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pn_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2pp_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2pp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvf16ger2pp_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pp_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf16ger2pp_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvf16ger2pp_non_def
+
+!CHECK-LABEL: @test_pmxvf16ger2pp_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32ger_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32ger(cq, vu10, vu11, 7, 2)
+      end subroutine test_pmxvf32ger_u1_def
+
+!CHECK-LABEL: @test_pmxvf32ger_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvf32ger_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32ger(cq, vu10, vu11, 7_2, 2_1)
+      end subroutine test_pmxvf32ger_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32ger_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvf32ger_r4_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32ger(cq, vr40, vr41, 7, 2)
+      end subroutine test_pmxvf32ger_r4_def
+
+!CHECK-LABEL: @test_pmxvf32ger_r4_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %6, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvf32ger_r4_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32ger(cq, vr40, vr41, 7_2, 2_1)
+      end subroutine test_pmxvf32ger_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32ger_r4_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %6, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvf32gernn_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gernn(cq, vu10, vu11, 7, 2)
+      end subroutine test_pmxvf32gernn_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gernn_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gernn(cq, vu10, vu11, 7_2, 2_1)
+      end subroutine test_pmxvf32gernn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gernn_r4_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gernn(cq, vr40, vr41, 7, 2)
+      end subroutine test_pmxvf32gernn_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_r4_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gernn_r4_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gernn(cq, vr40, vr41, 7_2, 2_1)
+      end subroutine test_pmxvf32gernn_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernn_r4_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gernp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gernp(cq, vu10, vu11, 7, 2)
+      end subroutine test_pmxvf32gernp_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gernp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gernp(cq, vu10, vu11, 7_2, 2_1)
+      end subroutine test_pmxvf32gernp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gernp_r4_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gernp(cq, vr40, vr41, 7, 2)
+      end subroutine test_pmxvf32gernp_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_r4_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gernp_r4_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gernp(cq, vr40, vr41, 7_2, 2_1)
+      end subroutine test_pmxvf32gernp_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gernp_r4_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpn_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gerpn(cq, vu10, vu11, 7, 2)
+      end subroutine test_pmxvf32gerpn_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpn_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gerpn(cq, vu10, vu11, 7_2, 2_1)
+      end subroutine test_pmxvf32gerpn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpn_r4_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gerpn(cq, vr40, vr41, 7, 2)
+      end subroutine test_pmxvf32gerpn_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_r4_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpn_r4_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gerpn(cq, vr40, vr41, 7_2, 2_1)
+      end subroutine test_pmxvf32gerpn_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpn_r4_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gerpp(cq, vu10, vu11, 7, 2)
+      end subroutine test_pmxvf32gerpp_u1_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvf32gerpp(cq, vu10, vu11, 7_2, 2_1)
+      end subroutine test_pmxvf32gerpp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpp_r4_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gerpp(cq, vr40, vr41, 7, 2)
+      end subroutine test_pmxvf32gerpp_r4_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_r4_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf32gerpp_r4_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_pmxvf32gerpp(cq, vr40, vr41, 7_2, 2_1)
+      end subroutine test_pmxvf32gerpp_r4_non_def
+
+!CHECK-LABEL: @test_pmxvf32gerpp_r4_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvf64ger_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64ger(cq, cp, vu10, 7, 2)
+      end subroutine test_pmxvf64ger_u1_def
+
+!CHECK-LABEL: @test_pmxvf64ger_u1_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %2, align 64
+
+      subroutine test_pmxvf64ger_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64ger(cq, cp, vu10, 7_2, 2_1)
+      end subroutine test_pmxvf64ger_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64ger_u1_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %2, align 64
+
+      subroutine test_pmxvf64ger_r8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64ger(cq, cp, vr80, 7, 2)
+      end subroutine test_pmxvf64ger_r8_def
+
+!CHECK-LABEL: @test_pmxvf64ger_r8_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %6, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64ger_r8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64ger(cq, cp, vr80, 7_2, 2_1)
+      end subroutine test_pmxvf64ger_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64ger_r8_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %4, <16 x i8> %6, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gernn_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernn(cq, cp, vu10, 7, 2)
+      end subroutine test_pmxvf64gernn_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_u1_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gernn_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernn(cq, cp, vu10, 7_2, 2_1)
+      end subroutine test_pmxvf64gernn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_u1_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gernn_r8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernn(cq, cp, vr80, 7, 2)
+      end subroutine test_pmxvf64gernn_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_r8_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gernn_r8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernn(cq, cp, vr80, 7_2, 2_1)
+      end subroutine test_pmxvf64gernn_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernn_r8_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gernp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernp(cq, cp, vu10, 7, 2)
+      end subroutine test_pmxvf64gernp_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_u1_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gernp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernp(cq, cp, vu10, 7_2, 2_1)
+      end subroutine test_pmxvf64gernp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_u1_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gernp_r8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernp(cq, cp, vr80, 7, 2)
+      end subroutine test_pmxvf64gernp_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_r8_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gernp_r8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gernp(cq, cp, vr80, 7_2, 2_1)
+      end subroutine test_pmxvf64gernp_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gernp_r8_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpn_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpn(cq, cp, vu10, 7, 2)
+      end subroutine test_pmxvf64gerpn_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_u1_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpn_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpn(cq, cp, vu10, 7_2, 2_1)
+      end subroutine test_pmxvf64gerpn_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_u1_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpn_r8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpn(cq, cp, vr80, 7, 2)
+      end subroutine test_pmxvf64gerpn_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_r8_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpn_r8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpn(cq, cp, vr80, 7_2, 2_1)
+      end subroutine test_pmxvf64gerpn_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpn_r8_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpp(cq, cp, vu10, 7, 2)
+      end subroutine test_pmxvf64gerpp_u1_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_u1_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpp(cq, cp, vu10, 7_2, 2_1)
+      end subroutine test_pmxvf64gerpp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_u1_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpp_r8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpp(cq, cp, vr80, 7, 2)
+      end subroutine test_pmxvf64gerpp_r8_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_r8_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvf64gerpp_r8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_pair :: cp
+      __vector_quad :: cq
+      call mma_pmxvf64gerpp(cq, cp, vr80, 7_2, 2_1)
+      end subroutine test_pmxvf64gerpp_r8_non_def
+
+!CHECK-LABEL: @test_pmxvf64gerpp_r8_non_def_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_pmxvi16ger2_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi16ger2_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2_i2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2(cq, vi20, vi21, 7, 7, 2)
+      end subroutine test_pmxvi16ger2_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_i2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2_i2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2(cq, vi20, vi21, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2_i2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2pp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2pp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi16ger2pp_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2pp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2pp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2pp_i2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2pp(cq, vi20, vi21, 7, 7, 2)
+      end subroutine test_pmxvi16ger2pp_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_i2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2pp_i2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2pp(cq, vi20, vi21, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2pp_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2pp_i2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2s_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2s(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi16ger2s_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2s_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2s(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2s_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2s_i2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2s(cq, vi20, vi21, 7, 7, 2)
+      end subroutine test_pmxvi16ger2s_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_i2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2s_i2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2s(cq, vi20, vi21, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2s_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2s_i2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %6, <16 x i8> %7, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2spp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2spp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi16ger2spp_u1_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2spp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi16ger2spp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2spp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2spp_i2_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2spp(cq, vi20, vi21, 7, 7, 2)
+      end subroutine test_pmxvi16ger2spp_i2_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_i2_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_pmxvi16ger2spp_i2_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_pmxvi16ger2spp(cq, vi20, vi21, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi16ger2spp_i2_non_def
+
+!CHECK-LABEL: @test_pmxvi16ger2spp_i2_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+
+      subroutine test_pmxvi4ger8_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi4ger8(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi4ger8_def
+
+!CHECK-LABEL: @test_pmxvi4ger8_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi4ger8_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi4ger8(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi4ger8_non_def
+
+!CHECK-LABEL: @test_pmxvi4ger8_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi4ger8pp_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi4ger8pp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi4ger8pp_def
+
+!CHECK-LABEL: @test_pmxvi4ger8pp_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi4ger8pp_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi4ger8pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi4ger8pp_non_def
+
+!CHECK-LABEL: @test_pmxvi4ger8pp_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4_i1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4(cq, vi10, vi11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_i1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4_i1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4(cq, vi10, vi11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4_i1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4pp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4pp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4pp_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4pp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4pp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4pp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4pp_i1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4pp(cq, vi10, vi11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4pp_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_i1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4pp_i1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4pp(cq, vi10, vi11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4pp_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4pp_i1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4spp_u1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4spp(cq, vu10, vu11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4spp_u1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_u1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4spp_u1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4spp(cq, vu10, vu11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4spp_u1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_u1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4spp_i1_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4spp(cq, vi10, vi11, 7, 7, 2)
+      end subroutine test_pmxvi8ger4spp_i1_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_i1_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_pmxvi8ger4spp_i1_non_def()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_pmxvi8ger4spp(cq, vi10, vi11, 7_2, 7_1, 2_8)
+      end subroutine test_pmxvi8ger4spp_i1_non_def
+
+!CHECK-LABEL: @test_pmxvi8ger4spp_i1_non_def_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5, i32 7, i32 7, i32 2)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64

diff  --git a/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90 b/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90
new file mode 100644
index 00000000000000..778d58a745be9d
--- /dev/null
+++ b/flang/test/Lower/PowerPC/ppc-mma-outer-product-2.f90
@@ -0,0 +1,857 @@
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr10 -emit-llvm %s -o - | FileCheck --check-prefixes="CHECK" %s
+! REQUIRES: target=powerpc{{.*}}
+
+      subroutine test_xvbf16ger2()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvbf16ger2(cq, vu10, vu11)
+      end subroutine test_xvbf16ger2
+
+!CHECK-LABEL: @test_xvbf16ger2_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %6, ptr %1, align 64
+
+
+      subroutine test_xvbf16ger2nn()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvbf16ger2nn(cq, vu10, vu11)
+      end subroutine test_xvbf16ger2nn
+
+!CHECK-LABEL: @test_xvbf16ger2nn_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvbf16ger2np()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvbf16ger2np(cq, vu10, vu11)
+      end subroutine test_xvbf16ger2np
+
+!CHECK-LABEL: @test_xvbf16ger2np_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvbf16ger2pn()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvbf16ger2pn(cq, vu10, vu11)
+      end subroutine test_xvbf16ger2pn
+
+!CHECK-LABEL: @test_xvbf16ger2pn_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvbf16ger2pp()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvbf16ger2pp(cq, vu10, vu11)
+      end subroutine test_xvbf16ger2pp
+
+!CHECK-LABEL: @test_xvbf16ger2pp_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf16ger2()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf16ger2(cq, vu10, vu11)
+      end subroutine test_xvf16ger2
+
+!CHECK-LABEL: @test_xvf16ger2_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_xvf16ger2nn()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf16ger2nn(cq, vu10, vu11)
+      end subroutine test_xvf16ger2nn
+
+!CHECK-LABEL: @test_xvf16ger2nn_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf16ger2np()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf16ger2np(cq, vu10, vu11)
+      end subroutine test_xvf16ger2np
+
+!CHECK-LABEL: @test_xvf16ger2np_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf16ger2pn()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf16ger2pn(cq, vu10, vu11)
+      end subroutine test_xvf16ger2pn
+
+!CHECK-LABEL: @test_xvf16ger2pn_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf16ger2pp()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf16ger2pp(cq, vu10, vu11)
+      end subroutine test_xvf16ger2pp
+
+!CHECK-LABEL: @test_xvf16ger2pp_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf32ger_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf32ger(cq, vu10, vu11)
+      end subroutine test_xvf32ger_u1
+
+!CHECK-LABEL: @test_xvf32ger_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+
+      subroutine test_xvf32ger_r4()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_xvf32ger(cq, vr40, vr41)
+      end subroutine test_xvf32ger_r4
+
+!CHECK-LABEL: @test_xvf32ger_r4_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %6, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_xvf32gernn_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf32gernn(cq, vu10, vu11)
+      end subroutine test_xvf32gernn_u1
+
+!CHECK-LABEL: @test_xvf32gernn_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf32gernn_r4()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_xvf32gernn(cq, vr40, vr41)
+      end subroutine test_xvf32gernn_r4
+
+!CHECK-LABEL: @test_xvf32gernn_r4_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvf32gernp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf32gernp(cq, vu10, vu11)
+      end subroutine test_xvf32gernp_u1
+
+!CHECK-LABEL: @test_xvf32gernp_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf32gernp_r4()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_xvf32gernp(cq, vr40, vr41)
+      end subroutine test_xvf32gernp_r4
+
+!CHECK-LABEL: @test_xvf32gernp_r4_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvf32gerpn_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf32gerpn(cq, vu10, vu11)
+      end subroutine test_xvf32gerpn_u1
+
+!CHECK-LABEL: @test_xvf32gerpn_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvf32gerpn_r4()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_xvf32gerpn(cq, vr40, vr41)
+      end subroutine test_xvf32gerpn_r4
+
+!CHECK-LABEL: @test_xvf32gerpn_r4_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvf32gerpp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvf32gerpp(cq, vu10, vu11)
+      end subroutine test_xvf32gerpp_u1
+
+!CHECK-LABEL: @test_xvf32gerpp_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+
+      subroutine test_xvf32gerpp_r4()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(4)) vr40, vr41
+      __vector_quad :: cq
+      call mma_xvf32gerpp(cq, vr40, vr41)
+      end subroutine test_xvf32gerpp_r4
+
+!CHECK-LABEL: @test_xvf32gerpp_r4_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %3 = alloca <4 x float>, i64 1, align 16
+!CHECK:  %4 = load <4 x float>, ptr %2, align 16
+!CHECK:  %5 = load <4 x float>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = bitcast <4 x float> %4 to <16 x i8>
+!CHECK:  %8 = bitcast <4 x float> %5 to <16 x i8>
+!CHECK:  %9 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:  store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvf64ger_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64ger(cq, cp, vu10)
+      end subroutine test_xvf64ger_u1
+
+!CHECK-LABEL: @test_xvf64ger_u1_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %2, align 64
+
+      subroutine test_xvf64ger_r8()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64ger(cq, cp, vr80)
+      end subroutine test_xvf64ger_r8
+
+!CHECK-LABEL: @test_xvf64ger_r8_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %4, <16 x i8> %6)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+
+      subroutine test_xvf64gernn_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gernn(cq, cp, vu10)
+      end subroutine test_xvf64gernn_u1
+
+!CHECK-LABEL: @test_xvf64gernn_u1_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+
+      subroutine test_xvf64gernn_r8()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gernn(cq, cp, vr80)
+      end subroutine test_xvf64gernn_r8
+
+!CHECK-LABEL: @test_xvf64gernn_r8_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_xvf64gernp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gernp(cq, cp, vu10)
+      end subroutine test_xvf64gernp_u1
+
+!CHECK-LABEL: @test_xvf64gernp_u1_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_xvf64gernp_r8()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gernp(cq, cp, vr80)
+      end subroutine test_xvf64gernp_r8
+
+!CHECK-LABEL: @test_xvf64gernp_r8_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_xvf64gerpn_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gerpn(cq, cp, vu10)
+      end subroutine test_xvf64gerpn_u1
+
+!CHECK-LABEL: @test_xvf64gerpn_u1_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+      subroutine test_xvf64gerpn_r8()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gerpn(cq, cp, vr80)
+      end subroutine test_xvf64gerpn_r8
+
+!CHECK-LABEL: @test_xvf64gerpn_r8_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_xvf64gerpp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gerpp(cq, cp, vu10)
+      end subroutine test_xvf64gerpp_u1
+
+!CHECK-LABEL: @test_xvf64gerpp_u1_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %2, align 64
+
+
+      subroutine test_xvf64gerpp_r8()
+      use, intrinsic :: mma
+      implicit none
+      vector(real(8)) vr80
+      __vector_quad :: cq
+      __vector_pair :: cp
+      call mma_xvf64gerpp(cq, cp, vr80)
+      end subroutine test_xvf64gerpp_r8
+
+!CHECK-LABEL: @test_xvf64gerpp_r8_
+!CHECK:  %1 = alloca <256 x i1>, i64 1, align 32
+!CHECK:  %2 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %3 = alloca <2 x double>, i64 1, align 16
+!CHECK:  %4 = load <256 x i1>, ptr %1, align 32
+!CHECK:  %5 = load <2 x double>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %2, align 64
+!CHECK:  %7 = bitcast <2 x double> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %6, <256 x i1> %4, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %2, align 64
+
+      subroutine test_xvi16ger2_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi16ger2(cq, vu10, vu11)
+      end subroutine test_xvi16ger2_u1
+
+!CHECK-LABEL: @test_xvi16ger2_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_xvi16ger2_i2()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_xvi16ger2(cq, vi20, vi21)
+      end subroutine test_xvi16ger2_i2
+
+!CHECK-LABEL: @test_xvi16ger2_i2_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:  %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:  %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:  %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:  %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:  %8 = call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %6, <16 x i8> %7)
+!CHECK:  store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_xvi16ger2pp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi16ger2pp(cq, vu10, vu11)
+      end subroutine test_xvi16ger2pp_u1
+
+!CHECK-LABEL: @test_xvi16ger2pp_u1_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi16ger2pp_i2()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_xvi16ger2pp(cq, vi20, vi21)
+      end subroutine test_xvi16ger2pp_i2
+
+!CHECK-LABEL: @test_xvi16ger2pp_i2_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:   %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:   %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:   %9 = call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:   store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvi16ger2s_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi16ger2s(cq, vu10, vu11)
+      end subroutine test_xvi16ger2s_u1
+
+!CHECK-LABEL:  @test_xvi16ger2s_u1_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_xvi16ger2s_i2()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_xvi16ger2s(cq, vi20, vi21)
+      end subroutine test_xvi16ger2s_i2
+
+!CHECK-LABEL:  @test_xvi16ger2s_i2_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:   %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:   %6 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:   %7 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:   %8 = call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %6, <16 x i8> %7)
+!CHECK:   store <512 x i1> %8, ptr %1, align 64
+
+      subroutine test_xvi16ger2spp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi16ger2spp(cq, vu10, vu11)
+      end subroutine test_xvi16ger2spp_u1
+
+!CHECK-LABEL:  @test_xvi16ger2spp_u1_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi16ger2spp_i2()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(2)) vi20, vi21
+      __vector_quad :: cq
+      call mma_xvi16ger2spp(cq, vi20, vi21)
+      end subroutine test_xvi16ger2spp_i2
+
+!CHECK-LABEL:  @test_xvi16ger2spp_i2_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %3 = alloca <8 x i16>, i64 1, align 16
+!CHECK:   %4 = load <8 x i16>, ptr %2, align 16
+!CHECK:   %5 = load <8 x i16>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = bitcast <8 x i16> %4 to <16 x i8>
+!CHECK:   %8 = bitcast <8 x i16> %5 to <16 x i8>
+!CHECK:   %9 = call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %6, <16 x i8> %7, <16 x i8> %8)
+!CHECK:   store <512 x i1> %9, ptr %1, align 64
+
+      subroutine test_xvi4ger8()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi4ger8(cq, vu10, vu11)
+      end subroutine test_xvi4ger8
+
+!CHECK-LABEL:  @test_xvi4ger8_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_xvi4ger8pp()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi4ger8pp(cq, vu10, vu11)
+      end subroutine test_xvi4ger8pp
+
+!CHECK-LABEL:  @test_xvi4ger8pp_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi8ger4_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi8ger4(cq, vu10, vu11)
+      end subroutine test_xvi8ger4_u1
+
+!CHECK-LABEL: @test_xvi8ger4_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+
+      subroutine test_xvi8ger4_i1()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_xvi8ger4(cq, vi10, vi11)
+      end subroutine test_xvi8ger4_i1
+
+!CHECK-LABEL: @test_xvi8ger4_i1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %6, ptr %1, align 64
+
+      subroutine test_xvi8ger4pp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi8ger4pp(cq, vu10, vu11)
+      end subroutine test_xvi8ger4pp_u1
+
+!CHECK-LABEL: @test_xvi8ger4pp_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi8ger4pp_i1()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_xvi8ger4pp(cq, vi10, vi11)
+      end subroutine test_xvi8ger4pp_i1
+
+!CHECK-LABEL:  @test_xvi8ger4pp_i1_
+!CHECK:   %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:   %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:   %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:   %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:   %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:   %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:   store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi8ger4spp_u1()
+      use, intrinsic :: mma
+      implicit none
+      vector(unsigned(1)) vu10, vu11
+      __vector_quad :: cq
+      call mma_xvi8ger4spp(cq, vu10, vu11)
+      end subroutine test_xvi8ger4spp_u1
+
+!CHECK-LABEL: @test_xvi8ger4spp_u1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64
+
+      subroutine test_xvi8ger4spp_i1()
+      use, intrinsic :: mma
+      implicit none
+      vector(integer(1)) vi10, vi11
+      __vector_quad :: cq
+      call mma_xvi8ger4spp(cq, vi10, vi11)
+      end subroutine test_xvi8ger4spp_i1
+
+!CHECK-LABEL: @test_xvi8ger4spp_i1_
+!CHECK:  %1 = alloca <512 x i1>, i64 1, align 64
+!CHECK:  %2 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %3 = alloca <16 x i8>, i64 1, align 16
+!CHECK:  %4 = load <16 x i8>, ptr %2, align 16
+!CHECK:  %5 = load <16 x i8>, ptr %3, align 16
+!CHECK:  %6 = load <512 x i1>, ptr %1, align 64
+!CHECK:  %7 = call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %6, <16 x i8> %4, <16 x i8> %5)
+!CHECK:  store <512 x i1> %7, ptr %1, align 64

