[llvm] [AMDGPU] Support V_FMA_MIX*_BF16 instructions on gfx1250 (PR #150381)

Thu Jul 24 00:15:13 PDT 2025

https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/150381

None

>From 0f6c52b32a94986a26ee61cf11ea559ebb8bd6f2 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Thu, 24 Jul 2025 00:08:33 -0700
Subject: [PATCH] [AMDGPU] Support V_FMA_MIX*_BF16 instructions on gfx1250

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  10 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 135 +++-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   8 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   3 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  10 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |   4 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   | 138 ++--
 llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll      | 634 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll   | 189 ++++++
 llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll   | 540 +++++++++++++++
 llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s       | 168 +++++
 .../AMDGPU/gfx1250_dasm_vop3p.txt             | 126 ++++
 12 files changed, 1874 insertions(+), 91 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2a36f3dea34ce..d6298c4ebf24a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
   "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
 >;
 
+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+  "HasFmaMixBF16Insts",
+  "true",
+  "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
 def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
   "HasIEEEMinimumMaximumInsts",
   "true",
@@ -2007,6 +2013,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureBF16ConversionInsts,
    FeatureBF16PackedInsts,
    FeatureCvtPkF16F32Inst,
+   FeatureFmaMixBF16Insts,
    FeatureMin3Max3PKF16,
    FeatureMinimum3Maximum3PKF16,
    FeaturePrngInst,
@@ -2599,6 +2606,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
 def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
   AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
 
+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+  AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
 def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
   AssemblerPredicate<(all_of FeatureDLInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5a2416debb417..0ca2286c11c94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3861,58 +3861,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16-bit of zeroes at LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+  if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+    return SDValue();
+  Op = Op.getOperand(0);
+
+  IsExtractHigh = false;
+  if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+    auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+    if (!Low16 || !Low16->isZero())
+      return SDValue();
+    Op = stripBitcast(Op.getOperand(1));
+    if (Op.getValueType() != MVT::bf16)
+      return SDValue();
+    return Op;
+  }
+
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+
+  if (Op.getOpcode() == ISD::AND) {
+    if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (Mask->getZExtValue() == 0xffff0000) {
+        IsExtractHigh = true;
+        return Op.getOperand(0);
+      }
+    }
+    return SDValue();
+  }
+
+  if (Op.getOpcode() == ISD::SHL) {
+    if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (Amt->getZExtValue() == 16)
+        return Op.getOperand(0);
+    }
+  }
+
+  return SDValue();
+}
+
 // The return value is not whether the match is possible (which it always is),
 // but whether or not it a conversion is really used.
 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
-                                                   unsigned &Mods) const {
+                                                   unsigned &Mods,
+                                                   MVT VT) const {
   Mods = 0;
   SelectVOP3ModsImpl(In, Src, Mods);
 
+  bool IsExtractHigh = false;
   if (Src.getOpcode() == ISD::FP_EXTEND) {
     Src = Src.getOperand(0);
-    assert(Src.getValueType() == MVT::f16);
-    Src = stripBitcast(Src);
+  } else if (VT == MVT::bf16) {
+    SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
+    if (!B16)
+      return false;
+    Src = B16;
+  } else
+    return false;
 
-    // Be careful about folding modifiers if we already have an abs. fneg is
-    // applied last, so we don't want to apply an earlier fneg.
-    if ((Mods & SISrcMods::ABS) == 0) {
-      unsigned ModsTmp;
-      SelectVOP3ModsImpl(Src, Src, ModsTmp);
+  if (Src.getValueType() != VT &&
+      (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+    return false;
 
-      if ((ModsTmp & SISrcMods::NEG) != 0)
-        Mods ^= SISrcMods::NEG;
+  Src = stripBitcast(Src);
 
-      if ((ModsTmp & SISrcMods::ABS) != 0)
-        Mods |= SISrcMods::ABS;
-    }
+  // Be careful about folding modifiers if we already have an abs. fneg is
+  // applied last, so we don't want to apply an earlier fneg.
+  if ((Mods & SISrcMods::ABS) == 0) {
+    unsigned ModsTmp;
+    SelectVOP3ModsImpl(Src, Src, ModsTmp);
+
+    if ((ModsTmp & SISrcMods::NEG) != 0)
+      Mods ^= SISrcMods::NEG;
 
-    // op_sel/op_sel_hi decide the source type and source.
-    // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
-    // If the sources's op_sel is set, it picks the high half of the source
-    // register.
+    if ((ModsTmp & SISrcMods::ABS) != 0)
+      Mods |= SISrcMods::ABS;
+  }
 
-    Mods |= SISrcMods::OP_SEL_1;
-    if (isExtractHiElt(Src, Src)) {
-      Mods |= SISrcMods::OP_SEL_0;
+  // op_sel/op_sel_hi decide the source type and source.
+  // If the source's op_sel_hi is set, it indicates to do a conversion from
+  // fp16. If the sources's op_sel is set, it picks the high half of the source
+  // register.
 
-      // TODO: Should we try to look for neg/abs here?
-    }
+  Mods |= SISrcMods::OP_SEL_1;
+  if (IsExtractHigh ||
+      (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+    Mods |= SISrcMods::OP_SEL_0;
 
-    // Prevent unnecessary subreg COPY to VGPR_16
-    if (Src.getOpcode() == ISD::TRUNCATE &&
-        Src.getOperand(0).getValueType() == MVT::i32) {
-      Src = Src.getOperand(0);
-    }
-    return true;
+    // TODO: Should we try to look for neg/abs here?
   }
 
-  return false;
+  // Prevent unnecessary subreg COPY to VGPR_16
+  if (Src.getOpcode() == ISD::TRUNCATE &&
+      Src.getOperand(0).getValueType() == MVT::i32) {
+    Src = Src.getOperand(0);
+  }
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods) const {
   unsigned Mods = 0;
-  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
     return false;
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
   return true;
@@ -3921,7 +3977,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                                SDValue &SrcMods) const {
   unsigned Mods = 0;
-  SelectVOP3PMadMixModsImpl(In, Src, Mods);
+  SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+                                                      SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+    return false;
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+                                                   SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6123d75d7b616..7ecba1e24ff51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -254,11 +254,15 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
 
   bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
-  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
-                                 unsigned &Mods) const;
+  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods,
+                                 MVT VT) const;
   bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                 SDValue &SrcMods) const;
   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+                                    SDValue &SrcMods) const;
+  bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+                                 SDValue &SrcMods) const;
 
   bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
                    SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0435e7f9e51d2..0683f02955594 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -123,6 +123,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasSMemRealTime = false;
   bool HasIntClamp = false;
   bool HasFmaMixInsts = false;
+  bool HasFmaMixBF16Insts = false;
   bool HasMovrel = false;
   bool HasVGPRIndexMode = false;
   bool HasScalarDwordx3Loads = false;
@@ -462,6 +463,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasFmaMixInsts;
   }
 
+  bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; }
+
   bool hasCARRY() const {
     return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee118356e..6d963b77850f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1061,10 +1061,12 @@ ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
 // where this is OK to use.
 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                        EVT DestVT, EVT SrcVT) const {
-  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
-          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
-         DestVT.getScalarType() == MVT::f32 &&
-         SrcVT.getScalarType() == MVT::f16 &&
+  return DestVT.getScalarType() == MVT::f32 &&
+         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+           SrcVT.getScalarType() == MVT::f16) ||
+          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
+           SrcVT.getScalarType() == MVT::bf16)) &&
          // TODO: This probably only requires no input flushing?
          denormalModeIsFlushAllF32(DAG.getMachineFunction());
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 485ca78db93a7..b0be3f864b94d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1662,6 +1662,8 @@ def VOP3OpSelMods  : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
 
 def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
 def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
+def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
 
 def VINTERPMods  : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
 def VINTERPModsHi  : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -2866,6 +2868,7 @@ def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=
 
 def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
 def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>;
 
 def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>;
 def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>;
@@ -2917,6 +2920,7 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=
 def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>;
 def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
+def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>;
 def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
 def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
 def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ea14c77cdff0b..7017da9dc3521 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,14 +35,18 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
                     bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
     bit UseTiedOutput = useTiedOutput;
 
+    defvar Src0RC = getVCSrcForVT<P.Src0VT>.ret;
+    defvar Src1RC = getVCSrcForVT<P.Src1VT>.ret;
+    defvar Src2RC = getVCSrcForVT<P.Src2VT>.ret;
+
     dag srcs =
-          (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
-               FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
-               FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+          (ins FP16InputMods:$src0_modifiers, Src0RC:$src0,
+               FP16InputMods:$src1_modifiers, Src1RC:$src1,
+               FP16InputMods:$src2_modifiers, Src2RC:$src2);
     dag dpp_srcs =
           (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
                FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
-               FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+               FP16InputMods:$src2_modifiers, Src2RC:$src2);
 
            // FIXME: Clamp0 misbehaves with the non-default vdst_in
            // following it. For now workaround this by requiring clamp
@@ -161,38 +165,42 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
 multiclass MadFmaMixPats<SDPatternOperator fma_like,
                          Instruction mix_inst,
                          Instruction mixlo_inst,
-                         Instruction mixhi_inst> {
+                         Instruction mixhi_inst,
+                         ValueType VT = f16,
+                         ValueType vecVT = v2f16> {
+  defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+  defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
   // At least one of the operands needs to be an fpextend of an f16
   // for this to be worthwhile, so we need three patterns here.
   // TODO: Could we use a predicate to inspect src1/2/3 instead?
   def : GCNPat <
-    (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
-                   (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
-                   (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+    (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+                   (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)),
+                   (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))),
     (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
               DSTCLAMP.NONE)>;
   def : GCNPat <
-    (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
-                   (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
-                   (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+    (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+                   (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)),
+                   (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))),
     (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
               DSTCLAMP.NONE)>;
   def : GCNPat <
-    (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
-                   (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
-                   (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+    (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)),
+                   (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)),
+                   (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
     (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
               DSTCLAMP.NONE)>;
 
   def : GCNPat <
     (AMDGPUclamp (build_vector
-      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
-      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
-    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+      (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+                        (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+                        (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+      (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+                        (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+                        (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+    (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0,
                        $hi_src1_modifiers, $hi_src1,
                        $hi_src2_modifiers, $hi_src2,
                        DSTCLAMP.ENABLE,
@@ -204,8 +212,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   >;
 
   def : GCNPat <
-    (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
-                        (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+    (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+                       (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
     (mixlo_inst $src0_modifiers, $src0,
                 $src1_modifiers, $src1,
                 (i32 0), (i32 0),
@@ -214,9 +222,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   >;
 
   def : GCNPat <
-    (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
-                                            (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
-    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+    (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+                                          (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))),
+    (vecVT (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        (i32 0), (i32 0),
                        DSTCLAMP.NONE,
@@ -224,9 +232,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   >;
 
   def : GCNPat <
-    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
     (mixlo_inst $src0_modifiers, $src0,
                 $src1_modifiers, $src1,
                 $src2_modifiers, $src2,
@@ -241,10 +249,10 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   let True16Predicate = p in {
 
   def : GCNPat <
-    (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                                     (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                                     (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+    (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                                   (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                                   (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+    (vecVT (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
                        DSTCLAMP.NONE,
@@ -253,11 +261,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
   def : GCNPat <
     (build_vector
-      f16:$elt0,
-      (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
-    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+      VT:$elt0,
+      (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                     (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                     (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+    (vecVT (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
                        DSTCLAMP.ENABLE,
@@ -268,38 +276,38 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
   let True16Predicate = UseRealTrue16Insts in {
   def : GCNPat <
-    (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
-    (v2f16 (mixlo_inst $src0_modifiers, $src0,
+    (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                           (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
+    (vecVT (mixlo_inst $src0_modifiers, $src0,
                 $src1_modifiers, $src1,
                 $src2_modifiers, $src2,
                 DSTCLAMP.NONE,
-                (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+                (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
   >;
 
   def : GCNPat <
-    (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+    (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                              (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                              (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+    (vecVT (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
                        DSTCLAMP.NONE,
-                       (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+                       (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
   >;
 
   def : GCNPat <
     (build_vector
-      f16:$elt0,
-      (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
-    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+      VT:$elt0,
+      (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+                                     (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+                                     (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
+    (vecVT (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
                        DSTCLAMP.ENABLE,
-                       (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
+                       (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
   >;
   } // end True16Predicate
 }
@@ -360,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
 defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
 }
 
+let SubtargetPredicate = HasFmaMixBF16Insts in {
+let isCommutable = 1 in {
+
+let isReMaterializable = 1 in
+defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile<VOP_F32_BF16_BF16_BF16, VOP3_OPSEL>>;
+
+let FPDPRounding = 1 in {
+defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+
+let ClampLo = 0, ClampHi = 1 in {
+defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
+}
+} // End FPDPRounding = 1
+} // End isCommutable = 1
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+} // End SubtargetPredicate = HasFmaMixBF16Insts
+
 def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
   let HasModifiers = 0;
 }
@@ -2247,6 +2273,10 @@ defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>;
 defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>;
 defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>;
 
+defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
+defm V_FMA_MIXLO_BF16   : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
+defm V_FMA_MIXHI_BF16   : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
+
 defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
 defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
new file mode 100644
index 0000000000000..11cda2d4171ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -0,0 +1,634 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.hi = lshr i32 %src0, 16
+  %src1.hi = lshr i32 %src1, 16
+  %src2.hi = lshr i32 %src2, 16
+  %src0.i16 = trunc i32 %src0.hi to i16
+  %src1.i16 = trunc i32 %src1.hi to i16
+  %src2.i16 = trunc i32 %src2.hi to i16
+  %src0.fp16 = bitcast i16 %src0.i16 to bfloat
+  %src1.fp16 = bitcast i16 %src1.i16 to bfloat
+  %src2.fp16 = bitcast i16 %src2.i16 to bfloat
+  %src0.ext = fpext bfloat %src0.fp16 to float
+  %src1.ext = fpext bfloat %src1.fp16 to float
+  %src2.ext = fpext bfloat %src2.fp16 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+  %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+  %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+  %src0.ext = fpext bfloat %src0.hi to float
+  %src1.ext = fpext bfloat %src1.hi to float
+  %src2.ext = fpext bfloat %src2.hi to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define <2 x float> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_shuffle:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v5, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> undef, <2 x i32> <i32 1, i32 0>
+  %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+  %src2.shuf = shufflevector <2 x bfloat> %src2, <2 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+  %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2.shuf to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  ret <2 x float> %result
+}
+
+define float @v_mad_mix_f32_negbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %src0.ext.neg = fneg float %src0.ext
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_absbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %src0.ext.neg.abs = fneg float %src0.ext.abs
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negf32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.neg = fneg float %src2
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_absf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_absf32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.abs = call float @llvm.fabs.f32(float %src2)
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negabsf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negabsf32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.abs = call float @llvm.fabs.f32(float %src2)
+  %src2.neg.abs = fneg float %src2.abs
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs)
+  ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imm1(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imm1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s0, 1.0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s0, 0.15915494
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000)
+  ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2 = fpext bfloat 0xR3e23 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s0, 0x367c0000
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2 = fpext bfloat 0xR367c to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imm1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 1.0, float 1.0>)
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_cvtbf16imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_cvtbf16imminv2pi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT:    s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[2:3], v[4:5], s[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2)
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 0.15915494 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 0x3FC45F3060000000, float 0x3FC45F3060000000>)
+  ret <2 x float> %result
+}
+
+define float @v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+  %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+  %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+  %src0.ext = fpext bfloat %src0.hi to float
+  %src1.ext = fpext bfloat %src1.hi to float
+  %src2.ext = fpext bfloat %src2.hi to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  ret float %clamp
+}
+
+define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  ret float %result
+}
+
+define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple_fabs:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.fabs = call float @llvm.fabs.f32(float %src0)
+  %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
+  ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_mul_f32 v0, v0, v1
+; GFX1250-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %mul = fmul float %src0.ext, %src1.ext
+  %result = fadd float %mul, %src2.ext
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1250-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %mul = fmul float %src0.ext, %src1.ext
+  %result = fadd float %mul, %src2
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %mul = fmul contract float %src0.ext, %src1.ext
+  %result = fadd contract float %mul, %src2.ext
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %mul = fmul contract float %src0.ext, %src1.ext
+  %result = fadd contract float %mul, %src2
+  ret float %result
+}
+
+define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0
+  %src0.neg = fneg bfloat %src0
+  %src0.ext = fpext bfloat %src0.neg to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+
+define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+  %src0.neg = fneg bfloat %src0
+  %src0.ext = fpext bfloat %src0.neg to float
+  %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+  %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0)
+  %src0.ext = fpext bfloat %src0.abs to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %fneg = fneg <2 x bfloat> %src0.arg.bc
+  %src0 = extractelement <2 x bfloat> %fneg, i32 1
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+  %src0 = extractelement <2 x bfloat> %fabs, i32 1
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+  %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+  %fneg.fabs = fneg <2 x bfloat> %fabs
+  %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, half %src1, half %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_fmac_f32_e32 v0, v3, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.bf16 = bitcast half %src0 to bfloat
+  %src1.bf16 = bitcast half %src1 to bfloat
+  %src2.bf16 = bitcast half %src2 to bfloat
+  %src0.ext = fpext bfloat %src0.bf16 to float
+  %src1.ext = fpext bfloat %src1.bf16 to float
+  %src2.ext = fpext bfloat %src2.bf16 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.bf16 = bitcast half %src0 to bfloat
+  %src0.ext = fpext bfloat %src0.bf16 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  ret float %result
+}
+
+define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1]
+; GFX1250-NEXT:    s_mov_b32 s0, 0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT:    s_endpgm
+entry:
+  %v0 = shl i32 %y, 16
+  %v1 = bitcast i32 %v0 to float
+  %mul7 = fmul contract float %x, 0.000000e+00
+  %add2 = fadd contract float %mul7, %v1
+  %v2 = fcmp uno float %add2, 0.000000e+00
+  %v3 = select i1 %v2, i64 1, i64 0
+  store i64 %v3, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat) #2
+declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #2
+declare float @llvm.fabs.f32(float) #2
+declare float @llvm.minnum.f32(float, float) #2
+declare float @llvm.maxnum.f32(float, float) #2
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
new file mode 100644
index 0000000000000..5b2de59f7d271
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec.result = insertelement <2 x bfloat> <bfloat 1.0, bfloat undef>, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, bfloat %src1, bfloat %src2, bfloat %lo) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec = insertelement <2 x bfloat> undef, bfloat %lo, i32 0
+  %vec.result = insertelement <2 x bfloat> %vec, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %bc = bitcast bfloat %cvt.result to i16
+  %ext = zext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %bc = bitcast bfloat %cvt.result to i16
+  %ext = sext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %cvt.result = fptrunc float %clamp to bfloat
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+  %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  store volatile bfloat %cvt.result, ptr addrspace(1) undef
+  %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+  %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
new file mode 100644
index 0000000000000..557080a61041c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
+define bfloat @mixlo_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_simple:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  %cvt.result = fptrunc float %result to bfloat
+  ret bfloat %cvt.result
+}
+
+define bfloat @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
+; GFX1250-LABEL: mixlo_simpl_no_flush:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  %cvt.result = fptrunc float %result to bfloat
+  ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bfloat %src1, bfloat %src2) {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %cvt.result = fptrunc float %result to bfloat
+  ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %cvt.result = fptrunc float %result to bfloat
+  %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+  %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+  ret bfloat %clamp
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %cvt.result = fptrunc float %clamp to bfloat
+  ret bfloat %cvt.result
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+  ret <2 x bfloat> %cvt.result
+}
+
+define <3 x bfloat> @v_mad_mix_v3f32(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_mov_b32_e32 v0, v6
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+  %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+  %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+  ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT:    v_and_b32_e32 v11, 0xffff0000, v5
+; GFX1250-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v12, 16, v4 :: v_dual_lshlrev_b32 v10, 16, v5
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[12:13]
+; GFX1250-NEXT:    v_pk_fma_f32 v[2:3], v[6:7], v[8:9], v[10:11]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+  %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+  %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+  ret <4 x bfloat> %cvt.result
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_max_num_bf16 v0, v0, 0
+; GFX1250-NEXT:    v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+  %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %cvt.result, <2 x bfloat> zeroinitializer)
+  %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+  ret <2 x bfloat> %clamp
+}
+
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_pk_max_num_bf16 v1, v6, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_max_num_bf16 v2, v0, 0
+; GFX1250-NEXT:    v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_min_num_bf16 v1, v2, 1.0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+  %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+  %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+  %max = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %cvt.result, <3 x bfloat> zeroinitializer)
+  %clamp = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %max, <3 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+  ret <3 x bfloat> %clamp
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX1250-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT:    v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_max_num_bf16 v0, v0, 0
+; GFX1250-NEXT:    v_pk_max_num_bf16 v1, v1, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+  %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+  %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+  %max = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %cvt.result, <4 x bfloat> zeroinitializer)
+  %clamp = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %max, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+  ret <4 x bfloat> %clamp
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, 0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v1, 1.0, v1
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+  %cvt.lo = extractelement <2 x bfloat> %cvt.result, i32 0
+  %max.lo = call bfloat @llvm.maxnum.bf16(bfloat %cvt.lo, bfloat 0.0)
+  %clamp.lo = call bfloat @llvm.minnum.bf16(bfloat %max.lo, bfloat 1.0)
+  %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.lo, i32 0
+  ret <2 x bfloat> %insert
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, 0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v1, 1.0, v1
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+  %cvt.hi = extractelement <2 x bfloat> %cvt.result, i32 1
+  %max.hi = call bfloat @llvm.maxnum.bf16(bfloat %cvt.hi, bfloat 0.0)
+  %clamp.hi = call bfloat @llvm.minnum.bf16(bfloat %max.hi, bfloat 1.0)
+  %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.hi, i32 1
+  ret <2 x bfloat> %insert
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+  %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+  %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+  %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
+  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+  %cvt.result = fptrunc <2 x float> %clamp to <2 x bfloat>
+  ret <2 x bfloat> %cvt.result
+}
+
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+  %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+  %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+  %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+  %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
+  %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <3 x float> %clamp to <3 x bfloat>
+  ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_precvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_precvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT:    v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v12, 16, v5 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_pk_fma_f32 v[4:5], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT:    v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_max_num_f32_e64 v2, v5, v5 clamp
+; GFX1250-NEXT:    v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT:    v_max_num_f32_e64 v3, v4, v4 clamp
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v3, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+  %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+  %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+  %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+  %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
+  %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  %cvt.result = fptrunc <4 x float> %clamp to <4 x bfloat>
+  ret <4 x bfloat> %cvt.result
+}
+
+define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_zext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  %cvt.result = fptrunc float %result to bfloat
+  %cvt.result.i16 = bitcast bfloat %cvt.result to i16
+  %cvt.result.i32 = zext i16 %cvt.result.i16 to i32
+  ret i32 %cvt.result.i32
+}
+
+define bfloat @mixlo_fptrunc(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc:
+; GFX1250:       ; %bb.0: ; %.entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+.entry:
+  %mul = fmul float %a, %b
+  %trunc = fptrunc float %mul to bfloat
+  ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) {
+; GFX1250-LABEL: mixlo_fptrunc_no_flush:
+; GFX1250:       ; %bb.0: ; %.entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+.entry:
+  %mul = fmul float %a, %b
+  %trunc = fptrunc float %mul to bfloat
+  ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_abs_src_mod:
+; GFX1250:       ; %bb.0: ; %.entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, |v0|, v1, 0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+.entry:
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %mul = fmul float %a.fabs, %b
+  %trunc = fptrunc float %mul to bfloat
+  ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_neg_src_mod:
+; GFX1250:       ; %bb.0: ; %.entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, -v0, v1, 0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+.entry:
+  %a.fneg = fneg float %a
+  %mul = fmul float %a.fneg, %b
+  %trunc = fptrunc float %mul to bfloat
+  ret bfloat %trunc
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare float @llvm.minnum.f32(float, float) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.maxnum.f32(float, float) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s
index 88346941bb2cd..a17fa674e6f6b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s
@@ -1313,3 +1313,171 @@ v_pk_max3_num_f16 v1, v4, v9, v16
 v_pk_max3_num_f16 v1, v2, v5, 1.0
 // GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0       ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3       ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3       ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0   ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105   ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3         ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0     ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo|  ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, v1, v2, s3
+// GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, v255, v255, s105
+// GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105   ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, s1, s2, v3
+// GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3         ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, s105, s105, m0
+// GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0     ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15
+// GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255
+// GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi
+// GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo|
+// GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo|  ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc|
+// GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1]
+// GFX1250: v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1]
+// GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0]
+// GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0]
+// GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp
+// GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt
index d3ef89957c255..18246db41749d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt
@@ -905,3 +905,129 @@
 
 # GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04]
 0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04
+
+# GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3       ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0   ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3       ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81
+
+# GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo|  ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3         ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0     ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105   ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81
+
+# GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61]
+0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61
+
+# GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53]
+0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53
+
+# GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo|  ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21]
+0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21
+
+# GFX1250: v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9]
+0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9
+
+# GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3]
+0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3
+
+# GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9]
+0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9
+
+# GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03]
+0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03
+
+# GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3         ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04]
+0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04
+
+# GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0     ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01]
+0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00]
+0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00
+
+# GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105   ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01]
+0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07]
+0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07
+
+# GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01]
+0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01
+
+# GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81]
+0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81