[llvm] [PowerPC] Add dense math bfloat16 floating-point outer-product accumulate to DMR instructions (PR #133109)

Maryam Moghadas via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 26 08:45:22 PDT 2025


https://github.com/maryammo created https://github.com/llvm/llvm-project/pull/133109

This patch adds the following Dense Math Facility bfloat16 floating-point calculation instructions: dmxvbf16gerx2, dmxvbf16gerx2pp, dmxvbf16gerx2pn, dmxvbf16gerx2np, dmxvbf16gerx2nn, pmdmxvbf16gerx2, pmdmxvbf16gerx2pp, pmdmxvbf16gerx2pn, pmdmxvbf16gerx2np, pmdmxvbf16gerx2nn, along with their corresponding intrinsics and tests.

>From aa2fd47a918e52e5d6e37d07755f77d1016e51ab Mon Sep 17 00:00:00 2001
From: Maryam Moghadas <maryammo at ca.ibm.com>
Date: Wed, 26 Mar 2025 15:43:07 +0000
Subject: [PATCH] [PowerPC] Add dense math bfloat16 floating-point
 outer-product accumulate to DMR instructions

This patch adds the following Dense Math Facility bfloat16 floating-point
calculation instructions: dmxvbf16gerx2, dmxvbf16gerx2pp, dmxvbf16gerx2pn,
dmxvbf16gerx2np, dmxvbf16gerx2nn, pmdmxvbf16gerx2, pmdmxvbf16gerx2pp,
pmdmxvbf16gerx2pn, pmdmxvbf16gerx2np, pmdmxvbf16gerx2nn, along with their
corresponding intrinsics and tests.
---
 llvm/include/llvm/IR/IntrinsicsPowerPC.td     |  23 +
 llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td  | 162 +++++-
 .../test/CodeGen/PowerPC/dmf-outer-product.ll | 484 ++++++++++++++++++
 .../PowerPC/ppc-encoding-ISAFuture.txt        |  32 +-
 .../PowerPC/ppc64le-encoding-ISAFuture.txt    |  30 ++
 llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s |  50 ++
 6 files changed, 779 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index e4d39134a4a25..b57102ef68f09 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -280,6 +280,22 @@ multiclass PowerPC_MMA_ACC_PP_Intrinsic<list<LLVMType> args> {
                                  [IntrNoMem]>;
 }
 
+multiclass PowerPC_MMA_DMR_Intrinsic<list<LLVMType> args> {
+  def NAME: DefaultAttrsIntrinsic<[llvm_v1024i1_ty], args, [IntrNoMem]>;
+  def pp : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def pn : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def np : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def nn : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+}
+
 multiclass PowerPC_MMA_DMR_PP_Intrinsic<list<LLVMType> args> {
   def NAME: DefaultAttrsIntrinsic<[llvm_v1024i1_ty], args, [IntrNoMem]>;
   def pp : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
@@ -1732,6 +1748,13 @@ let TargetPrefix = "ppc" in {
                             [llvm_v1024i1_ty, llvm_v256i1_ty, llvm_v16i8_ty,
                              llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem]>;
+
+  // MMA+ Reduced-Precision: bfloat16 Outer Product Intrinsic Definitions.
+  defm int_ppc_mma_dmxvbf16gerx2 :
+       PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
+  defm int_ppc_mma_pmdmxvbf16gerx2 :
+       PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
+                                     llvm_i32_ty, llvm_i32_ty]>;
 }
 
 // XL Compat intrinsics.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index d4f0e222b457c..8ea0924f09b43 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -95,7 +95,7 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
                                  list<dag> pattern>
   : PI<1, opcode, OOL, IOL, asmstr, itin> {
   bits<3> AT;
-  bits<6> XAp;
+  bits<5> XAp;
   bits<6> XB;
   bits<8> XMSK;
   bits<4> YMSK;
@@ -123,6 +123,40 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Inst{63} = 0;
 }
 
+class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                                 string asmstr, InstrItinClass itin,
+                                 list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<5> XAp;
+  bits<6> XB;
+  bits<8> XMSK;
+  bits<4> YMSK;
+  bits<2> PMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-15} = 0;
+  let Inst{16-17} = PMSK;
+  let Inst{18-19} = 0;
+  let Inst{20-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-46} = XAp{3-0};
+  let Inst{47} = 0;
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XAp{4};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
 multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
                        string asmstr> {
   let Predicates = [MMA, IsISAFuture] in {
@@ -159,6 +193,83 @@ multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
   }
 }
 
+multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                       string asmstr> {
+  let Predicates = [MMA, IsISAFuture] in {
+  def NAME :
+    XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
+                     !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+    RegConstraint<"@earlyclobber $AT">;
+  def PP :
+    XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+                     !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+multiclass DMR_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+  def PM#NAME :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !or(xo, 0x11), (outs dmr:$AT),
+      !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+      !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"@earlyclobber $AT">;
+  def PM#NAME#PP :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, xo, (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
+                                  string asmbase, string asmstr> {
+  defm NAME : DMR_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, IsISAFuture] in {
+  def PN : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def NP : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0x39), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def NN : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0xA0), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+  let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+   def PM#NAME#PN :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def PM#NAME#NP :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0x39), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def PM#NAME#NN :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
 let Predicates = [IsISAFuture] in {
   def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
                                             (outs vsrprc:$XAp, vsrprc:$XBp),
@@ -231,6 +342,11 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
     RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
 }
 
+// DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, DMXVBF16GERX2NP, DMXVBF16GERX2NN
+// PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
+defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
+                                         "dmxvbf16gerx2", "$AT, $XAp, $XB">;
+
 // MMA+ Intrinsics
 let Predicates = [MMA, IsISAFuture] in {
   def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
@@ -240,6 +356,21 @@ let Predicates = [MMA, IsISAFuture] in {
 
   def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
             (DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2 v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2 $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
 }
 
 let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
@@ -259,4 +390,33 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
                                                Msk4Imm:$PMSK)),
             (PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
+                                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                        Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                              Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                               Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                              Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                               Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 }
diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
index cba52567c900d..e3b43062f417c 100644
--- a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
@@ -285,3 +285,487 @@ entry:
   store <1024 x i1> %call, ptr %resp, align 64
   ret void
 }
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv vs0, 0(r4)
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    dmxvbf16gerx2 dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r5)
+; CHECK-NEXT:    stxvp vsp36, 64(r5)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r5)
+; CHECK-NEXT:    stxvp vsp36, 0(r5)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 16(r3)
+; CHECK-BE-NEXT:    dmxvbf16gerx2 dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2pp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2pp:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2pn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2pn dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2pn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2pn dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2np:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2np dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2np:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2np dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2nn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2nn dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2nn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2nn dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvbf16gerx2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv vs0, 0(r4)
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    pmdmxvbf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r5)
+; CHECK-NEXT:    stxvp vsp36, 64(r5)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r5)
+; CHECK-NEXT:    stxvp vsp36, 0(r5)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_pmdmxvbf16gerx2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 16(r3)
+; CHECK-BE-NEXT:    pmdmxvbf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> %v1, <16 x i8> %v2, i32 33, i32 5, i32 2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvbf16gerx2pp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    pmdmxvbf16gerx2pp dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_pmdmxvbf16gerx2pp:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    pmdmxvbf16gerx2pp dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2, i32 33, i32 5, i32 2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvbf16gerx2pn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    pmdmxvbf16gerx2pn dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_pmdmxvbf16gerx2pn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    pmdmxvbf16gerx2pn dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2, i32 33, i32 5, i32 2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvbf16gerx2np:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    pmdmxvbf16gerx2np dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_pmdmxvbf16gerx2np:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    pmdmxvbf16gerx2np dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2, i32 33, i32 5, i32 2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvbf16gerx2nn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    pmdmxvbf16gerx2nn dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_pmdmxvbf16gerx2nn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    pmdmxvbf16gerx2nn dmr0, vsp34, vs0, 33, 5, 2
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2, i32 33, i32 5, i32 2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index 5fa01371188cd..c3b5fa36641c6 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -92,4 +92,34 @@
 0xec,0x82,0x23,0x10
 
 #CHECK: pmdmxvi8gerx4spp 0, 2, 4, 8, 4, 4
-[0x07,0x90,0x40,0x84,0xec,0x02,0x23,0x10]
+0x07,0x90,0x40,0x84,0xec,0x02,0x23,0x10
+
+#CHECK: dmxvbf16gerx2 1, 2, 4
+0xec,0x82,0x22,0xd8
+
+#CHECK: dmxvbf16gerx2pp 1, 2, 4
+0xec,0x82,0x22,0x50
+
+#CHECK: dmxvbf16gerx2pn 1, 2, 4
+0xec,0x82,0x25,0x98
+
+#CHECK: dmxvbf16gerx2np 1, 2, 4
+0xec,0x82,0x23,0x98
+
+#CHECK: dmxvbf16gerx2nn 1, 2, 4
+0xec,0x82,0x27,0x50
+
+#CHECK: pmdmxvbf16gerx2 1, 2, 4, 8, 4, 2
+0x07,0x90,0x80,0x84,0xec,0x82,0x22,0xd8
+
+#CHECK: pmdmxvbf16gerx2pp 1, 2, 4, 8, 4, 2
+0x07,0x90,0x80,0x84,0xec,0x82,0x22,0x50
+
+#CHECK: pmdmxvbf16gerx2pn 1, 2, 4, 8, 4, 2
+0x07,0x90,0x80,0x84,0xec,0x82,0x25,0x98
+
+#CHECK: pmdmxvbf16gerx2np 1, 2, 4, 8, 4, 2
+0x07,0x90,0x80,0x84,0xec,0x82,0x23,0x98
+
+#CHECK: pmdmxvbf16gerx2nn 1, 2, 4, 8, 4, 2
+0x07,0x90,0x80,0x84,0xec,0x82,0x27,0x50
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index 72660f97c3757..2691e243191f9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -87,3 +87,33 @@
 
 #CHECK: pmdmxvi8gerx4spp 0, 2, 4, 8, 4, 4
 0x84,0x40,0x90,0x07,0x10,0x23,0x02,0xec
+
+#CHECK: dmxvbf16gerx2 1, 2, 4
+0xd8,0x22,0x82,0xec
+
+#CHECK: dmxvbf16gerx2pp 1, 2, 4
+0x50,0x22,0x82,0xec
+
+#CHECK: dmxvbf16gerx2pn 1, 2, 4
+0x98,0x25,0x82,0xec
+
+#CHECK: dmxvbf16gerx2np 1, 2, 4
+0x98,0x23,0x82,0xec
+
+#CHECK: dmxvbf16gerx2nn 1, 2, 4
+0x50,0x27,0x82,0xec
+
+#CHECK: pmdmxvbf16gerx2 1, 2, 4, 8, 4, 2
+0x84,0x80,0x90,0x07,0xd8,0x22,0x82,0xec
+
+#CHECK: pmdmxvbf16gerx2pp 1, 2, 4, 8, 4, 2
+0x84,0x80,0x90,0x07,0x50,0x22,0x82,0xec
+
+#CHECK: pmdmxvbf16gerx2pn 1, 2, 4, 8, 4, 2
+0x84,0x80,0x90,0x07,0x98,0x25,0x82,0xec
+
+#CHECK: pmdmxvbf16gerx2np 1, 2, 4, 8, 4, 2
+0x84,0x80,0x90,0x07,0x98,0x23,0x82,0xec
+
+#CHECK: pmdmxvbf16gerx2nn 1, 2, 4, 8, 4, 2
+0x84,0x80,0x90,0x07,0x50,0x27,0x82,0xec
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index f8d3f7741e52b..fe512e7e42382 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -126,3 +126,53 @@
 #CHECK-BE-SAME:                                                  0xec,0x02,0x23,0x10]
 #CHECK-LE:  pmdmxvi8gerx4spp 0, 2, 4, 8, 4, 4       # encoding: [0x84,0x40,0x90,0x07,
 #CHECK-LE-SAME:                                                  0x10,0x23,0x02,0xec]
+
+            dmxvbf16gerx2 1, 2, 4
+#CHECK-BE:  dmxvbf16gerx2 1, 2, 4                   # encoding: [0xec,0x82,0x22,0xd8]
+#CHECK-LE:  dmxvbf16gerx2 1, 2, 4                   # encoding: [0xd8,0x22,0x82,0xec]
+
+            dmxvbf16gerx2pp 1, 2, 4
+#CHECK-BE:  dmxvbf16gerx2pp 1, 2, 4                 # encoding: [0xec,0x82,0x22,0x50]
+#CHECK-LE:  dmxvbf16gerx2pp 1, 2, 4                 # encoding: [0x50,0x22,0x82,0xec]
+
+            dmxvbf16gerx2pn 1, 2, 4
+#CHECK-BE:  dmxvbf16gerx2pn 1, 2, 4                 # encoding: [0xec,0x82,0x25,0x98]
+#CHECK-LE:  dmxvbf16gerx2pn 1, 2, 4                 # encoding: [0x98,0x25,0x82,0xec]
+
+            dmxvbf16gerx2np 1, 2, 4
+#CHECK-BE:  dmxvbf16gerx2np 1, 2, 4                 # encoding: [0xec,0x82,0x23,0x98]
+#CHECK-LE:  dmxvbf16gerx2np 1, 2, 4                 # encoding: [0x98,0x23,0x82,0xec]
+
+            dmxvbf16gerx2nn 1, 2, 4
+#CHECK-BE:  dmxvbf16gerx2nn 1, 2, 4                 # encoding: [0xec,0x82,0x27,0x50]
+#CHECK-LE:  dmxvbf16gerx2nn 1, 2, 4                 # encoding: [0x50,0x27,0x82,0xec]
+
+            pmdmxvbf16gerx2 1, 2, 4, 8, 4, 2
+#CHECK-BE:  pmdmxvbf16gerx2 1, 2, 4, 8, 4, 2        # encoding: [0x07,0x90,0x80,0x84,
+#CHECK-BE-SAME:                                                  0xec,0x82,0x22,0xd8]
+#CHECK-LE:  pmdmxvbf16gerx2 1, 2, 4, 8, 4, 2        # encoding: [0x84,0x80,0x90,0x07,
+#CHECK-LE-SAME:                                                  0xd8,0x22,0x82,0xec]
+
+            pmdmxvbf16gerx2pp 1, 2, 4, 8, 4, 2
+#CHECK-BE:  pmdmxvbf16gerx2pp 1, 2, 4, 8, 4, 2      # encoding: [0x07,0x90,0x80,0x84,
+#CHECK-BE-SAME:                                                  0xec,0x82,0x22,0x50]
+#CHECK-LE:  pmdmxvbf16gerx2pp 1, 2, 4, 8, 4, 2      # encoding: [0x84,0x80,0x90,0x07,
+#CHECK-LE-SAME:                                                  0x50,0x22,0x82,0xec]
+
+            pmdmxvbf16gerx2pn 1, 2, 4, 8, 4, 2
+#CHECK-BE:  pmdmxvbf16gerx2pn 1, 2, 4, 8, 4, 2      # encoding: [0x07,0x90,0x80,0x84,
+#CHECK-BE-SAME:                                                  0xec,0x82,0x25,0x98]
+#CHECK-LE:  pmdmxvbf16gerx2pn 1, 2, 4, 8, 4, 2      # encoding: [0x84,0x80,0x90,0x07,
+#CHECK-LE-SAME:                                                  0x98,0x25,0x82,0xec]
+
+            pmdmxvbf16gerx2np 1, 2, 4, 8, 4, 2
+#CHECK-BE:  pmdmxvbf16gerx2np 1, 2, 4, 8, 4, 2      # encoding: [0x07,0x90,0x80,0x84,
+#CHECK-BE-SAME:                                                  0xec,0x82,0x23,0x98]
+#CHECK-LE:  pmdmxvbf16gerx2np 1, 2, 4, 8, 4, 2      # encoding: [0x84,0x80,0x90,0x07,
+#CHECK-LE-SAME:                                                  0x98,0x23,0x82,0xec]
+
+            pmdmxvbf16gerx2nn 1, 2, 4, 8, 4, 2
+#CHECK-BE:  pmdmxvbf16gerx2nn 1, 2, 4, 8, 4, 2      # encoding: [0x07,0x90,0x80,0x84,
+#CHECK-BE-SAME:                                                  0xec,0x82,0x27,0x50]
+#CHECK-LE:  pmdmxvbf16gerx2nn 1, 2, 4, 8, 4, 2      # encoding: [0x84,0x80,0x90,0x07,
+#CHECK-LE-SAME:                                                  0x50,0x27,0x82,0xec]



More information about the llvm-commits mailing list