[clang] [llvm] [AMDGPU] Introduce a new generic target `gfx9-4-generic` (PR #115190)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 13:30:33 PST 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/115190

>From 04ba8ff7a056437e7d1d3ec336bc942b999f67ce Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 6 Nov 2024 16:05:19 -0500
Subject: [PATCH 1/3] [NFC][AMDGPU] Guard FP8 related instructions properly

Currently `fp8-insts` is used in the front end to guard builtins, but the
corresponding feature is never used in the tablegen files to guard those
instructions. Instead, they are guarded by `isGFX940Plus`. The gfx9-4-generic
target doesn't support those instructions, so the guard needs to be updated.
---
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5eee71887964ad..cdaf489792a24d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -759,6 +759,10 @@ let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
   defm V_MFMA_I32_16X16X32I8       : MAIInst<"v_mfma_i32_16x16x32i8",       "I32_I64_X16",    int_amdgcn_mfma_i32_16x16x32_i8>;
   defm V_MFMA_F32_16X16X8XF32      : MAIInst<"v_mfma_f32_16x16x8xf32",      "F32_V2F32_X16",  int_amdgcn_mfma_f32_16x16x8_xf32>;
   defm V_MFMA_F32_32X32X4XF32      : MAIInst<"v_mfma_f32_32x32x4xf32",      "F32_V2F32_X32",  int_amdgcn_mfma_f32_32x32x4_xf32>;
+
+} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1
+
+let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32",    int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
   defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32",    int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
   defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32",    int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
@@ -767,7 +771,7 @@ let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16",    int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
   defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16",    int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
   defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16",    int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
-} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1
+} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
 
 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
   let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
@@ -783,6 +787,9 @@ defm V_SMFMAC_F32_16X16X32_BF16    : SMFMACInst<"v_smfmac_f32_16x16x32_bf16",
 defm V_SMFMAC_F32_32X32X16_BF16    : SMFMACInst<"v_smfmac_f32_32x32x16_bf16",    "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
 defm V_SMFMAC_I32_16X16X64_I8      : SMFMACInst<"v_smfmac_i32_16x16x64_i8",      "I32_16X16X64_I8",  int_amdgcn_smfmac_i32_16x16x64_i8>;
 defm V_SMFMAC_I32_32X32X32_I8      : SMFMACInst<"v_smfmac_i32_32x32x32_i8",      "I32_32X32X32_I8",  int_amdgcn_smfmac_i32_32x32x32_i8>;
+}
+
+let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
 defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
 defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
 defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8",  int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
@@ -791,7 +798,7 @@ defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8",
 defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
 defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
 defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8",  int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
-}
+} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
 
 def MAIInstInfoTable : GenericTable {
   let FilterClass = "MAIInst";
@@ -1759,6 +1766,7 @@ defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
 defm V_MFMA_F32_16X16X8XF32      : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
 defm V_MFMA_F32_32X32X4XF32      : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+let SubtargetPredicate = HasFP8Insts in {
 defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
 defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
 defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
@@ -1767,6 +1775,7 @@ defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
 defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
 defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
 defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;
+} // End SubtargetPredicate = HasFP8Insts
 
 defm V_MFMA_F32_32X32X4BF16_1K   : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
 defm V_MFMA_F32_16X16X4BF16_1K   : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
@@ -1783,6 +1792,7 @@ defm V_SMFMAC_F32_16X16X32_BF16    : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
 defm V_SMFMAC_F32_32X32X16_BF16    : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
 defm V_SMFMAC_I32_16X16X64_I8      : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
 defm V_SMFMAC_I32_32X32X32_I8      : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
+let SubtargetPredicate = HasFP8Insts in {
 defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
 defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
 defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
@@ -1791,6 +1801,7 @@ defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x3
 defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
 defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
 defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
+} // End SubtargetPredicate = HasFP8Insts
 
 defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
 defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;

>From 4077a199a263252ef45895e2c9b4e6375988fa88 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 6 Nov 2024 16:15:50 -0500
Subject: [PATCH 2/3] [AMDGPU] Introduce a "new" target feature `xf32-insts`

The hardware capability itself is not new; this change only adds a named
target feature so that it can be used to guard the corresponding instructions.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td            | 11 +++++++++++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h       |  4 ++++
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td |  8 ++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 910f5e06a6f3c4..d068402e95716e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1110,6 +1110,13 @@ def FeatureRequiresCOV6 : SubtargetFeature<"requires-cov6",
   "Target Requires Code Object V6"
 >;
 
+def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
+   "HasXF32Insts",
+   "true",
+   "Has instructions that support xf32 format, such as "
+   "v_mfma_f32_16x16x8xf32 and v_mfma_f32_32x32x4xf32"
+ >;
+
 // Dummy feature used to disable assembler instructions.
 def FeatureDisable : SubtargetFeature<"",
   "FeatureDisable","true",
@@ -1448,6 +1455,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
    FeatureFP8ConversionInsts,
    FeatureCvtFP8VOP1Bug,
    FeaturePkFmacF16Inst,
+   FeatureXF32Insts,
    FeatureAtomicFaddRtnInsts,
    FeatureAtomicFaddNoRtnInsts,
    FeatureAtomicBufferGlobalPkAddF16Insts,
@@ -2289,6 +2297,9 @@ def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
 
 def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
 
+def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
+   AssemblerPredicate<(all_of FeatureXF32Insts)>;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 1ea3beb2855d69..6ff964077d8fd0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -179,6 +179,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasDefaultComponentZero = false;
   bool HasAgentScopeFineGrainedRemoteMemoryAtomics = false;
   bool HasDefaultComponentBroadcast = false;
+  bool HasXF32Insts = false;
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
   /// indicates a lack of S_CLAUSE support.
@@ -1302,6 +1303,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() == GFX12;
   }
 
+  /// \returns true if the target has instructions with xf32 format support.
+  bool hasXF32Insts() const { return HasXF32Insts; }
+
   /// \returns The maximum number of instructions that can be enclosed in an
   /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
   /// instruction.
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index cdaf489792a24d..e246d433401f94 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -757,10 +757,12 @@ let Predicates = [isGFX90APlus] in {
 let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
   defm V_MFMA_I32_32X32X16I8       : MAIInst<"v_mfma_i32_32x32x16i8",       "I32_I64_X32",    int_amdgcn_mfma_i32_32x32x16_i8>;
   defm V_MFMA_I32_16X16X32I8       : MAIInst<"v_mfma_i32_16x16x32i8",       "I32_I64_X16",    int_amdgcn_mfma_i32_16x16x32_i8>;
+} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1
+
+let SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_16X16X8XF32      : MAIInst<"v_mfma_f32_16x16x8xf32",      "F32_V2F32_X16",  int_amdgcn_mfma_f32_16x16x8_xf32>;
   defm V_MFMA_F32_32X32X4XF32      : MAIInst<"v_mfma_f32_32x32x4xf32",      "F32_V2F32_X32",  int_amdgcn_mfma_f32_32x32x4_xf32>;
-
-} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1
+} // End SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1
 
 let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32",    int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
@@ -1764,8 +1766,10 @@ defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;
 
 defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
+let SubtargetPredicate = HasXF32Insts in {
 defm V_MFMA_F32_16X16X8XF32      : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
 defm V_MFMA_F32_32X32X4XF32      : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+} // End SubtargetPredicate = HasXF32Insts
 let SubtargetPredicate = HasFP8Insts in {
 defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
 defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;

>From c3a5591b86796d815becceae312a1458f1e6268a Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 6 Nov 2024 12:49:45 -0500
Subject: [PATCH 3/3] [AMDGPU] Introduce a new generic target `gfx9-4-generic`

---
 clang/include/clang/Basic/Cuda.h              |    1 +
 clang/lib/Basic/Cuda.cpp                      |    1 +
 clang/lib/Basic/Targets/NVPTX.cpp             |    1 +
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |    1 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |    3 +
 clang/test/Driver/amdgpu-macros.cl            |    1 +
 clang/test/Driver/amdgpu-mcpu.cl              |    2 +
 .../Misc/target-invalid-cpu-note/amdgcn.c     |    1 +
 .../test/Misc/target-invalid-cpu-note/nvptx.c |    1 +
 llvm/docs/AMDGPUUsage.rst                     |   10 +
 llvm/include/llvm/BinaryFormat/ELF.h          |    3 +-
 llvm/include/llvm/TargetParser/TargetParser.h |    3 +-
 llvm/lib/Object/ELFObjectFile.cpp             |    2 +
 llvm/lib/ObjectYAML/ELFYAML.cpp               |    1 +
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   33 +-
 llvm/lib/Target/AMDGPU/GCNProcessors.td       |    5 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |    5 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |    1 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |    5 +
 llvm/lib/TargetParser/TargetParser.cpp        |    5 +
 .../CodeGen/AMDGPU/directive-amdgcn-target.ll |    4 +
 .../CodeGen/AMDGPU/div-rem-by-constant-64.ll  |  443 +++++++
 llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir   |    1 +
 .../CodeGen/AMDGPU/elf-header-flags-mach.ll   |    2 +
 .../AMDGPU/generic-targets-require-v6.ll      |    3 +
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1144 +++++++++++++++++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  496 +++++++
 .../AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll   |    1 +
 .../AMDGPU/no-corresponding-integer-type.ll   |    1 +
 .../MC/AMDGPU/gfx9_4_generic_unsupported.s    |   56 +
 .../Object/AMDGPU/elf-header-flags-mach.yaml  |    7 +
 .../llvm-objdump/ELF/AMDGPU/subtarget.ll      |    6 +
 .../llvm-readobj/ELF/AMDGPU/elf-headers.test  |    3 +
 llvm/tools/llvm-readobj/ELFDumper.cpp         |    1 +
 34 files changed, 2245 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 7b4f435dc39f29..721e8981af6ffc 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -103,6 +103,7 @@ enum class OffloadArch {
   GFX909,
   GFX90a,
   GFX90c,
+  GFX9_4_GENERIC,
   GFX940,
   GFX941,
   GFX942,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d765baef913e2f..59c932468cd891 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -121,6 +121,7 @@ static const OffloadArchToStringMap arch_names[] = {
     GFX(909),  // gfx909
     GFX(90a),  // gfx90a
     GFX(90c),  // gfx90c
+    {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"},
     GFX(940),  // gfx940
     GFX(941),  // gfx941
     GFX(942),  // gfx942
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index e0bd0b096324d8..0897032c4b8546 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -205,6 +205,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::GFX909:
       case OffloadArch::GFX90a:
       case OffloadArch::GFX90c:
+      case OffloadArch::GFX9_4_GENERIC:
       case OffloadArch::GFX940:
       case OffloadArch::GFX941:
       case OffloadArch::GFX942:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 598b946ad88dbb..43dc0e62284602 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2301,6 +2301,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::GFX909:
       case OffloadArch::GFX90a:
       case OffloadArch::GFX90c:
+      case OffloadArch::GFX9_4_GENERIC:
       case OffloadArch::GFX940:
       case OffloadArch::GFX941:
       case OffloadArch::GFX942:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index f3473346baae5a..692f5103724342 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -56,6 +56,8 @@
 
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
 
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck --check-prefix=GFX9_4_Generic %s
+
 // NOCPU-NOT: "target-features"
 // NOCPU-WAVE32: "target-features"="+wavefrontsize32"
 // NOCPU-WAVE64: "target-features"="+wavefrontsize64"
@@ -85,6 +87,7 @@
 // GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index dd5a4483e4d607..9352c59ef28b7f 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -133,6 +133,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12
 
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_generic -DFAMILY=GFX9
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_4_generic -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_1_generic -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_3_generic -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx11_generic -DFAMILY=GFX11
diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index 42ce33db6eec07..ba578435072985 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -118,6 +118,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s
 
 // RUN: %clang -### -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefix=GFX9_GENERIC %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefix=GFX9_4_GENERIC %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefix=GFX10_1_GENERIC %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefix=GFX10_3_GENERIC %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefix=GFX11_GENERIC %s
@@ -172,6 +173,7 @@
 // GFX1201:   "-target-cpu" "gfx1201"
 
 // GFX9_GENERIC:      "-target-cpu" "gfx9-generic"
+// GFX9_4_GENERIC:    "-target-cpu" "gfx9-4-generic"
 // GFX10_1_GENERIC:   "-target-cpu" "gfx10-1-generic"
 // GFX10_3_GENERIC:   "-target-cpu" "gfx10-3-generic"
 // GFX11_GENERIC:     "-target-cpu" "gfx11-generic"
diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
index b3ddbd53a0391b..cae31797c1febe 100644
--- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
+++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
@@ -70,6 +70,7 @@
 // CHECK-SAME: {{^}}, gfx1200
 // CHECK-SAME: {{^}}, gfx1201
 // CHECK-SAME: {{^}}, gfx9-generic
+// CHECK-SAME: {{^}}, gfx9-4-generic
 // CHECK-SAME: {{^}}, gfx10-1-generic
 // CHECK-SAME: {{^}}, gfx10-3-generic
 // CHECK-SAME: {{^}}, gfx11-generic
diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
index a59e1c6fab1c49..44fe07065b2428 100644
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -50,6 +50,7 @@
 // CHECK-SAME: {{^}}, gfx909
 // CHECK-SAME: {{^}}, gfx90a
 // CHECK-SAME: {{^}}, gfx90c
+// CHECK-SAME: {{^}}, gfx9-4-generic
 // CHECK-SAME: {{^}}, gfx940
 // CHECK-SAME: {{^}}, gfx941
 // CHECK-SAME: {{^}}, gfx942
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5b83ea428c0bff..5bc57decda80d0 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -576,6 +576,16 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
                                                                                                   - ``v_dot2_f32_f16``
 
 
+     ``gfx9-4-generic``   ``amdgcn``     - ``gfx940``      - xnack            - Absolute flat   - The following instructions are not available:
+                                         - ``gfx941``                           scratch
+                                         - ``gfx942``                                             - ``v_mfma_f32_16x16x8_xf32``
+                                                                                                  - ``v_mfma_f32_32x32x4_xf32``
+                                                                                                  - ``v_cvt_f32_fp8``
+                                                                                                  - ``v_cvt_f32_bf8``
+                                                                                                  - ``v_cvt_pk_f32_fp8``
+                                                                                                  - ``v_cvt_pk_f32_bf8``
+
+
      ``gfx10-1-generic``  ``amdgcn``     - ``gfx1010``     - xnack            - Absolute flat   - The following instructions are
                                          - ``gfx1011``     - wavefrontsize64    scratch           not available on ``gfx1011``
                                          - ``gfx1012``     - cumode                               and ``gfx1012``
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index c591a96232f115..6c05ea7208e1f1 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -822,11 +822,12 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57   = 0x057,
   EF_AMDGPU_MACH_AMDGCN_GFX1153         = 0x058,
   EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC   = 0x059,
+  EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC  = 0x05f,
   // clang-format on
 
   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC,
 
   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index ae86ff39083d89..c6db4dfd7f5159 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -119,9 +119,10 @@ enum GPUKind : uint32_t {
   GK_GFX10_3_GENERIC = 194,
   GK_GFX11_GENERIC = 195,
   GK_GFX12_GENERIC = 196,
+  GK_GFX9_4_GENERIC = 197,
 
   GK_AMDGCN_GENERIC_FIRST = GK_GFX9_GENERIC,
-  GK_AMDGCN_GENERIC_LAST = GK_GFX12_GENERIC,
+  GK_AMDGCN_GENERIC_LAST = GK_GFX9_4_GENERIC,
 };
 
 /// Instruction set architecture version.
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 5096877d2a4b00..9dc39936ffd8bb 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -602,6 +602,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
   // Generic AMDGCN targets
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:
     return "gfx9-generic";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC:
+    return "gfx9-4-generic";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC:
     return "gfx10-1-generic";
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC:
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e97248cbcf5682..76c6c496c5e6bc 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -631,6 +631,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, EF_AMDGPU_MACH);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d068402e95716e..2fd28df338921a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1451,9 +1451,6 @@ def FeatureISAVersion9_4_Common : FeatureSet<
    FeatureDPALU_DPP,
    FeaturePackedFP32Ops,
    FeatureMAIInsts,
-   FeatureFP8Insts,
-   FeatureFP8ConversionInsts,
-   FeatureCvtFP8VOP1Bug,
    FeaturePkFmacF16Inst,
    FeatureXF32Insts,
    FeatureAtomicFaddRtnInsts,
@@ -1476,15 +1473,36 @@ def FeatureISAVersion9_4_Common : FeatureSet<
 
 def FeatureISAVersion9_4_0 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-    [FeatureForceStoreSC0SC1])>;
+    [
+      FeatureForceStoreSC0SC1,
+      FeatureFP8Insts,
+      FeatureFP8ConversionInsts,
+      FeatureCvtFP8VOP1Bug,
+      FeatureXF32Insts
+    ])>;
 
 def FeatureISAVersion9_4_1 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-    [FeatureForceStoreSC0SC1])>;
+    [
+      FeatureForceStoreSC0SC1,
+      FeatureFP8Insts,
+      FeatureFP8ConversionInsts,
+      FeatureCvtFP8VOP1Bug,
+      FeatureXF32Insts
+    ])>;
 
 def FeatureISAVersion9_4_2 : FeatureSet<
   !listconcat(FeatureISAVersion9_4_Common.Features,
-    [])>;
+    [
+      FeatureFP8Insts,
+      FeatureFP8ConversionInsts,
+      FeatureCvtFP8VOP1Bug,
+      FeatureXF32Insts
+    ])>;
+
+def FeatureISAVersion9_4_Generic : FeatureSet<
+  !listconcat(FeatureISAVersion9_4_Common.Features,
+    [FeatureRequiresCOV6])>;
 
 def FeatureISAVersion10_Common : FeatureSet<
   [FeatureGFX10,
@@ -2029,6 +2047,9 @@ def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
 def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
   AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
 
+def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
+  AssemblerPredicate<(all_of FeatureXF32Insts)>;
+
 def D16PreservesUnusedBits :
   Predicate<"Subtarget->d16PreservesUnusedBits()">,
   AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 547941633fda61..e12a6127b17063 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -209,6 +209,11 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
   FeatureISAVersion9_Generic.Features
 >;
 
+// [gfx940, gfx941, gfx942]
+def : ProcessorModel<"gfx9-4-generic", SIQuarterSpeedModel,
+  FeatureISAVersion9_4_Generic.Features
+>;
+
 //===----------------------------------------------------------------------===//
 // GCN GFX10.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5c625c3d83ff1b..507725b91a9bee 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -119,6 +119,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:     AK = GK_GFX9_GENERIC; break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC:   AK = GK_GFX9_4_GENERIC; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC:  AK = GK_GFX10_1_GENERIC; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC:  AK = GK_GFX10_3_GENERIC; break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC:    AK = GK_GFX11_GENERIC; break;
@@ -204,6 +205,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
   case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
   case GK_GFX9_GENERIC:     return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
+  case GK_GFX9_4_GENERIC:   return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC;
   case GK_GFX10_1_GENERIC:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC;
   case GK_GFX10_3_GENERIC:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC;
   case GK_GFX11_GENERIC:    return ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC;
@@ -821,6 +823,9 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() {
     case AMDGPU::GK_GFX9_GENERIC:
       Version = GenericVersion::GFX9;
       break;
+    case AMDGPU::GK_GFX9_4_GENERIC:
+      Version = GenericVersion::GFX9_4;
+      break;
     case AMDGPU::GK_GFX10_1_GENERIC:
       Version = GenericVersion::GFX10_1;
       break;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index beebe320b2cf3a..88a6d75b72c7d0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -46,6 +46,7 @@ struct IsaVersion;
 /// within a generic family.
 namespace GenericVersion {
 static constexpr unsigned GFX9 = 1;
+static constexpr unsigned GFX9_4 = 1;
 static constexpr unsigned GFX10_1 = 1;
 static constexpr unsigned GFX10_3 = 1;
 static constexpr unsigned GFX11 = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e246d433401f94..e15463a9c39262 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -775,6 +775,11 @@ let SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1 in {
   defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16",    int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
 } // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
 
+let SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1 in {
+  defm V_MFMA_F32_16X16X8XF32      : MAIInst<"v_mfma_f32_16x16x8xf32",      "F32_V2F32_X16",  int_amdgcn_mfma_f32_16x16x8_xf32>;
+  defm V_MFMA_F32_32X32X4XF32      : MAIInst<"v_mfma_f32_32x32x4xf32",      "F32_V2F32_X32",  int_amdgcn_mfma_f32_32x32x4_xf32>;
+} // End SubtargetPredicate = HasXF32Insts, is_gfx940_xdl = 1
+
 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
   let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
       isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in {
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 059d7b4f5ff2d0..e7a979eaedb75c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -130,6 +130,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
     {{"gfx1201"},   {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
 
     {{"gfx9-generic"},      {"gfx9-generic"},    GK_GFX9_GENERIC,    FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
+    {{"gfx9-4-generic"},    {"gfx9-4-generic"},  GK_GFX9_4_GENERIC,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
     {{"gfx10-1-generic"},   {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
     {{"gfx10-3-generic"},   {"gfx10-3-generic"}, GK_GFX10_3_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
     {{"gfx11-generic"},     {"gfx11-generic"},   GK_GFX11_GENERIC,   FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
@@ -156,6 +157,8 @@ StringRef llvm::AMDGPU::getArchFamilyNameAMDGCN(GPUKind AK) {
   switch (AK) {
   case AMDGPU::GK_GFX9_GENERIC:
     return "gfx9";
+  case AMDGPU::GK_GFX9_4_GENERIC:
+    return "gfx9";
   case AMDGPU::GK_GFX10_1_GENERIC:
   case AMDGPU::GK_GFX10_3_GENERIC:
     return "gfx10";
@@ -296,6 +299,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   // TODO: Split up this API depending on its caller so
   // generic target handling is more obvious and less risky.
   case GK_GFX9_GENERIC:    return {9, 0, 0};
+  case GK_GFX9_4_GENERIC:  return {9, 4, 0};
   case GK_GFX10_1_GENERIC: return {10, 1, 0};
   case GK_GFX10_3_GENERIC: return {10, 3, 0};
   case GK_GFX11_GENERIC:   return {11, 0, 3};
@@ -466,6 +470,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
     case GK_GFX942:
     case GK_GFX941:
     case GK_GFX940:
+    case GK_GFX9_4_GENERIC:
       Features["gfx940-insts"] = true;
       Features["fp8-insts"] = true;
       Features["fp8-conversion-insts"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index 5986d2d38ef1ad..4eac26e853c2a0 100644
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -112,6 +112,8 @@
 
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_NOXNACK %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_XNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_NOXNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_XNACK %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_NOXNACK %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_XNACK %s
 ; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic < %s | FileCheck --check-prefixes=GFX10_3_GENERIC %s
@@ -210,6 +212,8 @@
 
 ; GFX9_GENERIC_NOXNACK:     .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack-"
 ; GFX9_GENERIC_XNACK:       .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack+"
+; GFX9_4_GENERIC_NOXNACK:   .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack-"
+; GFX9_4_GENERIC_XNACK:     .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack+"
 ; GFX10_1_GENERIC_NOXNACK:  .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack-"
 ; GFX10_1_GENERIC_XNACK:    .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack+"
 ; GFX10_3_GENERIC:          .amdgcn_target "amdgcn-amd-amdhsa--gfx10-3-generic"
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 662de47413654f..10ef4dc280d091 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s | FileCheck -check-prefixes=GFX9_4 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - %s | FileCheck -check-prefixes=GFX1030 %s
 
 ; Sample test to check how we deal with division/modulos by 64 bit constants.
@@ -66,6 +67,37 @@ define noundef i64 @srem64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_3:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_mul_lo_u32 v6, v2, s3
+; GFX9_4-NEXT:    v_mul_lo_u32 v7, v2, s2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v5, v7, v5, v6
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_3:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -158,6 +190,37 @@ define noundef i64 @srem64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_6:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_mul_lo_u32 v6, v2, s3
+; GFX9_4-NEXT:    v_mul_lo_u32 v7, v2, s2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v5, v7, v5, v6
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_6:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -240,6 +303,32 @@ define noundef i64 @urem64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_3:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v2, v3, v2, 1
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_3:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,6 +406,32 @@ define noundef i64 @urem64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_6:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v2, v3, v2, 2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, 6, 0
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_6:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -393,6 +508,30 @@ define noundef i64 @sdiv64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_3:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX9_4-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v1, v6, v1, v2
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_3:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,6 +609,30 @@ define noundef i64 @sdiv64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_6:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT:    v_mul_lo_u32 v2, v0, s3
+; GFX9_4-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v1, v6, v1, v2
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_6:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -536,6 +699,25 @@ define noundef i64 @udiv64_3(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_3:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_3:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -596,6 +778,25 @@ define noundef i64 @udiv64_6(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_6:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 2
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_6:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -641,6 +842,18 @@ define noundef i64 @srem64_2(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_2:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_and_b32_e32 v2, -2, v2
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_2:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -675,6 +888,15 @@ define noundef i64 @sdiv64_2(i64 noundef %i)  {
 ; GFX942-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_2:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_2:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -703,6 +925,13 @@ define noundef i64 @urem64_2(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_2:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9_4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_2:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -729,6 +958,13 @@ define noundef i64 @udiv64_2(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_2:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_2:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -766,6 +1002,19 @@ define noundef i64 @srem64_64(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_64:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 26, v2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_and_b32_e32 v2, 0xffffffc0, v2
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_64:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -803,6 +1052,16 @@ define noundef i64 @sdiv64_64(i64 noundef %i)  {
 ; GFX942-NEXT:    v_ashrrev_i64 v[0:1], 6, v[0:1]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_64:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 26, v2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_ashrrev_i64 v[0:1], 6, v[0:1]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_64:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -832,6 +1091,13 @@ define noundef i64 @urem64_64(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_64:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX9_4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_64:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,6 +1124,13 @@ define noundef i64 @udiv64_64(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 6, v1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_64:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 6
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v1, 6, v1
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_64:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -895,6 +1168,19 @@ define noundef i64 @srem64_i32min(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_i32min:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_and_b32_e32 v2, 0x80000000, v2
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_i32min:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -937,6 +1223,19 @@ define noundef i64 @sdiv64_i32min(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_i32min:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v0
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_i32min:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -979,6 +1278,18 @@ define noundef i64 @urem64_i32min(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_i32min:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_alignbit_b32 v4, v1, v0, 31
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v4, 1, v[2:3]
+; GFX9_4-NEXT:    v_add_lshl_u32 v2, v2, v5, 30
+; GFX9_4-NEXT:    v_and_b32_e32 v2, 0x80000000, v2
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_i32min:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1025,6 +1336,20 @@ define noundef i64 @udiv64_i32min(i64 noundef %i)  {
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_i32min:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_alignbit_b32 v2, v1, v0, 31
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GFX9_4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, 1, v[0:1]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_i32min:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1117,6 +1442,43 @@ define noundef i64 @srem64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: srem64_i32max:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, 3
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX9_4-NEXT:    v_lshl_add_u32 v8, v6, 31, v6
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x80000001
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v7, v7, v8, v6
+; GFX9_4-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7]
+; GFX9_4-NEXT:    v_mov_b32_e32 v4, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5]
+; GFX9_4-NEXT:    v_sub_u32_e32 v2, v7, v1
+; GFX9_4-NEXT:    v_sub_u32_e32 v7, v2, v0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5]
+; GFX9_4-NEXT:    s_brev_b32 s2, -2
+; GFX9_4-NEXT:    v_add_u32_e32 v5, v1, v5
+; GFX9_4-NEXT:    v_ashrrev_i64 v[6:7], 30, v[4:5]
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[2:3]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: srem64_i32max:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1216,6 +1578,35 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: sdiv64_i32max:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, 3
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX9_4-NEXT:    v_lshl_add_u32 v8, v6, 31, v6
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    s_mov_b32 s2, 0x80000001
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT:    v_add3_u32 v7, v7, v8, v6
+; GFX9_4-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7]
+; GFX9_4-NEXT:    v_mov_b32_e32 v4, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5]
+; GFX9_4-NEXT:    v_sub_u32_e32 v2, v7, v1
+; GFX9_4-NEXT:    v_sub_u32_e32 v7, v2, v0
+; GFX9_4-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5]
+; GFX9_4-NEXT:    v_add_u32_e32 v5, v1, v5
+; GFX9_4-NEXT:    v_ashrrev_i64 v[0:1], 30, v[4:5]
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: sdiv64_i32max:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1308,6 +1699,36 @@ define noundef i64 @urem64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: urem64_i32max:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, 5
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    s_brev_b32 s2, -2
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v2, v3, v2, 30
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT:    s_nop 0
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: urem64_i32max:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1383,6 +1804,28 @@ define noundef i64 @udiv64_i32max(i64 noundef %i)  {
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX9_4-LABEL: udiv64_i32max:
+; GFX9_4:       ; %bb.0: ; %entry
+; GFX9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT:    v_mul_hi_u32 v2, v0, 5
+; GFX9_4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3]
+; GFX9_4-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5]
+; GFX9_4-NEXT:    v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3]
+; GFX9_4-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT:    s_nop 1
+; GFX9_4-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX9_4-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT:    v_alignbit_b32 v0, v1, v0, 30
+; GFX9_4-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
+; GFX9_4-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1030-LABEL: udiv64_i32max:
 ; GFX1030:       ; %bb.0: ; %entry
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index e24817078d8bc9..524e074bb69de4 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index f293c52bf6bfb2..f1f4edb94a6178 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -80,6 +80,7 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s
 
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_GENERIC %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-4-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_4_GENERIC %s
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-1-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_1_GENERIC %s
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-3-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_3_GENERIC %s
 ; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx11-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX11_GENERIC %s
@@ -161,6 +162,7 @@
 ; GFX1201:       EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
 
 ; GFX9_GENERIC:       EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
+; GFX9_4_GENERIC:     EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F)
 ; GFX10_1_GENERIC:    EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
 ; GFX10_3_GENERIC:    EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC (0x53)
 ; GFX11_GENERIC:      EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC (0x54)
diff --git a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
index d30cf1531a06b1..d58dc5db8c80f1 100644
--- a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
+++ b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
@@ -1,16 +1,19 @@
 ; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-V5 %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-4-V5 %s
 ; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX101-V5 %s
 ; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX103-V5 %s
 ; RUN: not llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX11-V5 %s
 ; RUN: not llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX12-V5 %s
 
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -o - %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 -o - %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=6 -o - %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=6 -o - %s
 
 ; GFX9-V5:   gfx9-generic is only available on code object version 6 or better
+; GFX9-4-V5: gfx9-4-generic is only available on code object version 6 or better
 ; GFX101-V5: gfx10-1-generic is only available on code object version 6 or better
 ; GFX103-V5: gfx10-3-generic is only available on code object version 6 or better
 ; GFX11-V5:  gfx11-generic is only available on code object version 6 or better
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index da1d9972e42dcf..c3f1d47f35ed28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s
 
 --- |
   define amdgpu_kernel void @largeInterleave() #0 { ret void }
@@ -1145,6 +1146,1149 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    s_endpgm
+  ;
+  ; GCN9_4-LABEL: largeInterleave:
+  ; GCN9_4:       ; %bb.0:
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr0
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr8
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr94
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr132
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr133
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr139
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_readfirstlane_b32 s7, v0
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN9_4-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr5
+  ; GCN9_4-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v0, s7, 4, v1
+  ; GCN9_4-NEXT:    v_mul_lo_u32 v0, v0, s6
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr1
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr6
+  ; GCN9_4-NEXT:    v_add_lshl_u32 v92, v0, v1, 1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v93, s0, v92
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    s_lshl_b32 s0, s7, 7
+  ; GCN9_4-NEXT:    v_add_lshl_u32 v95, v8, s0, 1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v8, 64, v93
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr8
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr92
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[0:3]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[4:7] offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
+  ; GCN9_4-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr88
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN9_4-NEXT:    v_add_u32_e32 v72, 0x80, v93
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr72
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[64:67]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN9_4-NEXT:    v_add_u32_e32 v66, 0xc0, v93
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr67
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr70
+  ; GCN9_4-NEXT:    v_add_u32_e32 v68, v132, v64
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr66
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr71
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr68
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[72:75], v92, s[8:11], 0 offen offset:192 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[76:79], v66, s[8:11], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v66, v132, v67
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[64:65], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr8
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[68:69], v66, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v66, v132, v70
+  ; GCN9_4-NEXT:    v_add_u32_e32 v70, v132, v71
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[66:67], v66, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[70:71], v70, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v94
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[86:87], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v94 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v94 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v94 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[84:85], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v88
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[86:87], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v88 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v88 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[80:83], v88 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[72:75]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b128 v95, v[76:79] offset:1024
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[84:85], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v94 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1024
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[72:75], v88 offset:1536
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN9_4-NEXT:    v_perm_b32 v72, v68, v64, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v64, v68, v64, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v68, v69, v65, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v73, v70, v66, s5
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN9_4-NEXT:    v_perm_b32 v74, v69, v65, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v65, v70, v66, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v69, v71, v67, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v75, v71, v67, s8
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v66, s4, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v49
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, s6, v67
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v50
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v51
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v52
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v53
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v54
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v55
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v56
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v57
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v58
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v59
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v60
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v61
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v62
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v63
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v32
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v33
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v34
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v35
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v36
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v37
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v38
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v39
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v40
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v41
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v42
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v43
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v44
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v45
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v46
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v47
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v17
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v18
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v19
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v21
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v22
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v23
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v24
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v25
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v26
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v27
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v28
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v29
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v30
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v31
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v1
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v2
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v3
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v4
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v5
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v6
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v7
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v8
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v9
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v10
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v11
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v67, s4, v12
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v13
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v67, v70
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr67
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr6
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v14
+  ; GCN9_4-NEXT:    v_add_u32_e32 v67, s7, v67
+  ; GCN9_4-NEXT:    v_and_b32_e32 v67, 0x1fffffff, v67
+  ; GCN9_4-NEXT:    v_mul_lo_u32 v67, v67, s6
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v71, s4, v15
+  ; GCN9_4-NEXT:    v_max3_f32 v66, v66, v70, v71
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr70
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr6_sgpr7
+  ; GCN9_4-NEXT:    v_add_lshl_u32 v135, v70, v67, 1
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v67, v133, v66
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr70
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v136, v70, 1, v135
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v135, v[72:73]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v136, v[64:65]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr64
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v137, v64, 1, v136
+  ; GCN9_4-NEXT:    v_max_f32_e32 v65, v67, v67
+  ; GCN9_4-NEXT:    v_max_f32_e32 v65, v66, v65
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v66, v133, v65
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr64
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v138, v64, 1, v137
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v137, v[68:69]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr68
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v138, v[74:75]
+  ; GCN9_4-NEXT:    v_add_u32_e32 v68, v132, v68
+  ; GCN9_4-NEXT:    v_cndmask_b32_e64 v64, v66, v65, s[6:7]
+  ; GCN9_4-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr65
+  ; GCN9_4-NEXT:    v_max_f32_e32 v66, v65, v65
+  ; GCN9_4-NEXT:    v_max_f32_e32 v134, v66, v64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr66
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v64, v132, v64
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v64, v132, v66
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr67
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v64, v132, v67
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v48, s4, v48, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v57, s4, v57, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN9_4-NEXT:    v_fma_f32 v64, s4, v49, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN9_4-NEXT:    v_fma_f32 v66, s4, v50, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v162, v57
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v49, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
+  ; GCN9_4-NEXT:    v_fma_f32 v67, s4, v51, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v50, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v66
+  ; GCN9_4-NEXT:    v_fma_f32 v68, s4, v52, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v51, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v67
+  ; GCN9_4-NEXT:    v_fma_f32 v69, s4, v53, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v52, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v68
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_fma_f32 v70, s4, v54, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v53, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v69
+  ; GCN9_4-NEXT:    v_fma_f32 v71, s4, v55, -v134
+  ; GCN9_4-NEXT:    ds_read_b128 v[140:143], v139
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v54, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v55, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
+  ; GCN9_4-NEXT:    v_fma_f32 v66, s4, v56, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v56, v48
+  ; GCN9_4-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v64, v49
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v67, v50
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v68, v51
+  ; GCN9_4-NEXT:    v_fma_f32 v112, s4, v58, -v134
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v58, v52
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN9_4-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v160, s4, v59, -v134
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v59, v68, v58
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v58, v64, v67
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[140:141], v[58:59], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v66
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v161, v64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v112
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
+  ; GCN9_4-NEXT:    v_fma_f32 v144, s4, v61, -v134
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v144
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v164, v144
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v140, v53
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v141, v54
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v61, v55
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN9_4-NEXT:    v_fma_f32 v145, s4, v62, -v134
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[148:149], v[58:59], v[64:79]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v148, v57
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v57, v56
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v63, -v134
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v63, v61, v57
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v62, v140, v141
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v160
+  ; GCN9_4-NEXT:    v_fma_f32 v60, s4, v60, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v163, s4, v33, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v145
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v160, v161
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v144, v148
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v152, v57
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v153, 0x3fb8aa3b, v60
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr57
+  ; GCN9_4-NEXT:    ds_read_b128 v[58:61], v57
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v40, s4, v40, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v44, s4, v44, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v16, s4, v16, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v24, s4, v24, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v28, s4, v28, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v0, s4, v0, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[142:143], v[62:63], v[96:111]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v153, v153
+  ; GCN9_4-NEXT:    ds_read_b128 v[140:143], v57 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[62:63], v[80:95]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v146, v162
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[150:151], v[62:63], v[64:79]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v151, v33
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v33, v152
+  ; GCN9_4-NEXT:    v_fma_f32 v150, s4, v34, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[62:63], v[112:127]
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v63, v144, v33
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v62, v160, v146
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v35, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v154, v33
+  ; GCN9_4-NEXT:    v_fma_f32 v160, s4, v36, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v163
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v155, v153
+  ; GCN9_4-NEXT:    v_fma_f32 v163, s4, v37, -v134
+  ; GCN9_4-NEXT:    v_perm_b32 v37, v130, v128, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[58:59], v[62:63], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v32
+  ; GCN9_4-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v165, v58
+  ; GCN9_4-NEXT:    v_perm_b32 v59, v131, v129, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v58, v159, v157, s5
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[62:63], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v166, v36
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v141, v151
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v140, v164
+  ; GCN9_4-NEXT:    v_perm_b32 v36, v158, v156, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[62:63], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+  ; GCN9_4-NEXT:    v_fma_f32 v150, s4, v38, -v134
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v38, v154
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v167, v32
+  ; GCN9_4-NEXT:    v_perm_b32 v32, v158, v156, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v33, v130, v128, s5
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v128, v155, v140
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b64 v135, v[32:33]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr33
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v136, v[36:37]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v137, v[58:59]
+  ; GCN9_4-NEXT:    v_add_u32_e32 v33, v132, v33
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr36
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr37
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v160
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[62:63], v[112:127]
+  ; GCN9_4-NEXT:    v_perm_b32 v63, v131, v129, s8
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v129, v141, v38
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_perm_b32 v62, v159, v157, s8
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v155, v38
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr38
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v138, v[62:63]
+  ; GCN9_4-NEXT:    v_add_u32_e32 v38, v132, v38
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[130:131], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v33, v132, v36
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v33, v132, v37
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v39, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[60:61], v[128:129], v[96:111]
+  ; GCN9_4-NEXT:    ds_read_b128 v[36:39], v139
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v163
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v33, v165
+  ; GCN9_4-NEXT:    ds_read_b128 v[58:61], v139 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v157, v32
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v142, v166
+  ; GCN9_4-NEXT:    v_fma_f32 v143, s4, v41, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v150, s4, v42, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[128:129], v[64:79]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v34, v167
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v158, v32
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v32, v155
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v129, v34, v32
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v128, v33, v142
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v146, v32
+  ; GCN9_4-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v142, s4, v43, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v46, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[128:129], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
+  ; GCN9_4-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v147, v36
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v37, v156
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[58:59], v[128:129], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v143, v36
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v58, v157
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
+  ; GCN9_4-NEXT:    v_fma_f32 v59, s4, v45, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[128:129], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v33, v158
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v150, v32
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v32, v146
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v33, v33, v32
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v32, v37, v58
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v58, v147
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v129, v36
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
+  ; GCN9_4-NEXT:    v_fma_f32 v128, s4, v47, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[32:33], v[96:111]
+  ; GCN9_4-NEXT:    ds_read_b128 v[36:39], v57
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v142, v40
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v59
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v59, v143
+  ; GCN9_4-NEXT:    ds_read_b128 v[44:47], v57 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[32:33], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v61, v40
+  ; GCN9_4-NEXT:    v_fma_f32 v60, s4, v17, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v40, v58, v59
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v18, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v58, s4, v20, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v60
+  ; GCN9_4-NEXT:    v_fma_f32 v60, s4, v21, -v134
+  ; GCN9_4-NEXT:    v_perm_b32 v21, v144, v140, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[32:33], v[64:79]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v34, v150
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v159, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v17, v129
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v41, v34, v17
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
+  ; GCN9_4-NEXT:    v_fma_f32 v42, s4, v19, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v128, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v43, v142
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[40:41], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
+  ; GCN9_4-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v160, v36
+  ; GCN9_4-NEXT:    v_perm_b32 v36, v131, v63, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v37, v145, v141, s8
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v163, v20
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v45, v159
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v44, v61
+  ; GCN9_4-NEXT:    v_perm_b32 v20, v130, v62, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[40:41], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_fma_f32 v149, s4, v22, -v134
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v22, v128
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v168, v16
+  ; GCN9_4-NEXT:    v_perm_b32 v16, v130, v62, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v17, v144, v140, s5
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b64 v135, v[16:17]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v136, v[20:21]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr17
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr20
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr21
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v58
+  ; GCN9_4-NEXT:    v_add_u32_e32 v17, v132, v17
+  ; GCN9_4-NEXT:    v_add_u32_e32 v20, v132, v20
+  ; GCN9_4-NEXT:    v_add_u32_e32 v21, v132, v21
+  ; GCN9_4-NEXT:    v_fma_f32 v62, s4, v23, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v41, v45, v22
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v42
+  ; GCN9_4-NEXT:    v_perm_b32 v32, v131, v63, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v33, v145, v141, s5
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v63, v22
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr22
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v137, v[32:33]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v138, v[36:37]
+  ; GCN9_4-NEXT:    v_add_u32_e32 v22, v132, v22
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v40, v43, v44
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[32:33], v22, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ds_read_b128 v[20:23], v139
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[40:41], v[96:111]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v130, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v60
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v17, v160
+  ; GCN9_4-NEXT:    ds_read_b128 v[36:39], v139 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v131, s4, v26, -v134
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[40:41], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v60, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v149
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v46, v163
+  ; GCN9_4-NEXT:    v_fma_f32 v47, s4, v25, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[40:41], v[64:79]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v132, v16
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v18, v168
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v16, v63
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[40:41], v[112:127]
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v35, v18, v16
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v34, v17, v46
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v41, v16
+  ; GCN9_4-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v40, s4, v27, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v62, s4, v30, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[34:35], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v24
+  ; GCN9_4-NEXT:    ds_read_b128 v[24:27], v139 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v46, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v47
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v21, v130
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[34:35], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v47, v20
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v36, v60
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v40
+  ; GCN9_4-NEXT:    v_fma_f32 v37, s4, v29, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[34:35], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v131, v16
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v17, v132
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v16, v41
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v17, v17, v16
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v16, v21, v36
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v36, v46
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[34:35], v[112:127]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v35, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
+  ; GCN9_4-NEXT:    v_fma_f32 v34, s4, v31, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[16:17], v[96:111]
+  ; GCN9_4-NEXT:    ds_read_b128 v[20:23], v57
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v40, v24
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v37, v47
+  ; GCN9_4-NEXT:    ds_read_b128 v[28:31], v57 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
+  ; GCN9_4-NEXT:    v_fma_f32 v38, s4, v1, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v39, v24
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v62
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v24, v36, v37
+  ; GCN9_4-NEXT:    v_fma_f32 v62, s4, v2, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v37, s4, v6, -v134
+  ; GCN9_4-NEXT:    v_perm_b32 v6, v42, v32, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[16:17], v[64:79]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v140, v1
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v18, v131
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v35
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v25, v18, v1
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v34
+  ; GCN9_4-NEXT:    v_fma_f32 v34, s4, v4, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v38
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
+  ; GCN9_4-NEXT:    v_fma_f32 v26, s4, v3, -v134
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v27, v1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[24:25], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v36, v20
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v21, v40
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v20, v39
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v29, v4
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v26
+  ; GCN9_4-NEXT:    v_fma_f32 v28, s4, v5, -v134
+  ; GCN9_4-NEXT:    v_perm_b32 v5, v58, v44, s5
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v26, v36
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[24:25], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v62
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v38, v0
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v140
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v27
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v0, v21, v20
+  ; GCN9_4-NEXT:    v_perm_b32 v20, v43, v33, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v21, v59, v45, s8
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v25, v4
+  ; GCN9_4-NEXT:    v_fma_f32 v24, s4, v7, -v134
+  ; GCN9_4-NEXT:    v_perm_b32 v4, v42, v32, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v16, v43, v33, s5
+  ; GCN9_4-NEXT:    v_perm_b32 v7, v58, v44, s8
+  ; GCN9_4-NEXT:    v_perm_b32 v17, v59, v45, s5
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b64 v135, v[4:5]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v136, v[6:7]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v137, v[16:17]
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b64 v138, v[20:21]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v34
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_read_b128 v[4:7], v139
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_fma_f32 v33, s4, v10, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[0:1], v[96:111]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v32, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v28
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v28, v29
+  ; GCN9_4-NEXT:    ds_read_b128 v[20:23], v139 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[0:1], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v31, v16
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v16, v26, v28
+  ; GCN9_4-NEXT:    v_fma_f32 v30, s4, v9, -v134
+  ; GCN9_4-NEXT:    v_fma_f32 v28, s4, v14, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v37
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v34, v2
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v3, v38
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[0:1], v[112:127]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v25
+  ; GCN9_4-NEXT:    v_fma_f32 v18, s4, v11, -v134
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v17, v3, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v19, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v139 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[16:17], v[96:111]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v8
+  ; GCN9_4-NEXT:    ds_read_b128 v[8:11], v139 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v24, v4
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v5, v32
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v30
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v26, v4
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v18, v24
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v20, v31
+  ; GCN9_4-NEXT:    v_fma_f32 v21, s4, v13, -v134
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[16:17], v[64:79]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v33
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v34
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v30, v0
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v19
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v0, v5, v20
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v17, v4
+  ; GCN9_4-NEXT:    v_fma_f32 v16, s4, v15, -v134
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v12
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v20, v8
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v21
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v21, v26
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[0:1], v[96:111]
+  ; GCN9_4-NEXT:    ds_read_b128 v[4:7], v57
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[12:15], v57 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v22, v8
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v8, v18, v21
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v3, v30
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v23, v2
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v2, v22
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v23
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v9, v3, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v16
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v10, v0
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v3, v20
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v10
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[8:9], v[96:111]
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v5, v1, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, 0, v49
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v50, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v51, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v52, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v53, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v54, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v55, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v56, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v161, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v162, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v148, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v152, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v153, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v164, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v151, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v154, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v165, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v166, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v167, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v155, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v156, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v157, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v158, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v146, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v147, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v143, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v150, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v129, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v142, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v61, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v159, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v128, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v160, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v163, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v168, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v63, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v130, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v60, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v132, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v4, v3, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v41, v0
+  ; GCN9_4-NEXT:    s_nop 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[4:5], v[96:111]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v46, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v47, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v131, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v35, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v40, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v39, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v140, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v27, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v36, v6
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[8:9], v[64:79]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v29, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v38, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v25, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v32, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v31, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v34, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v19, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v24, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v26, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v30, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v17, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v20, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v22, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v23, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v10, v0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[4:5], v[64:79]
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v1, v133, v0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[4:5], v[80:95]
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    v_add_f32_e32 v4, v0, v1
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v5, v133, v4
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr0
+  ; GCN9_4-NEXT:    v_cndmask_b32_e64 v1, v5, v4, s[6:7]
+  ; GCN9_4-NEXT:    v_fmac_f32_e32 v1, v0, v48
+  ; GCN9_4-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
   !0 = !{i64 2862105}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0473e017f193cb..5497d859c97dc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s
 
 --- |
   define amdgpu_kernel void @smallInterleave() #0 { ret void }
@@ -488,6 +489,501 @@
   ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
   ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
   ; GCN-NEXT:    s_endpgm
+  ;
+  ; GCN9_4-LABEL: smallInterleave:
+  ; GCN9_4:       ; %bb.0:
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr2
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN9_4-NEXT:    v_readfirstlane_b32 s20, v2
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr3
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr0_vgpr1
+  ; GCN9_4-NEXT:    s_lshl_b32 s4, s20, 7
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr6
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr7
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr49
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr48
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr51
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr68
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr69
+  ; GCN9_4-NEXT:    v_max_f32_e32 v79, v69, v69
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN9_4-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s0, v2, v[0:1]
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr5
+  ; GCN9_4-NEXT:    v_add_lshl_u32 v50, v5, s4, 1
+  ; GCN9_4-NEXT:    v_add_u32_e32 v6, v6, v49
+  ; GCN9_4-NEXT:    v_add_u32_e32 v7, v7, v49
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr7
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr6
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr4
+  ; GCN9_4-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+  ; GCN9_4-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v50, v[0:3]
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[64:65], v6, s[16:19], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[66:67], v7, s[16:19], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    ds_read_b128 v[36:39], v48
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ds_read_b128 v[44:47], v48 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr6
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr5
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr2
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr3
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr0_sgpr1
+  ; GCN9_4-NEXT:    v_perm_b32 v80, v66, v64, s2
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
+  ; GCN9_4-NEXT:    v_perm_b32 v64, v66, v64, s3
+  ; GCN9_4-NEXT:    v_perm_b32 v66, v67, v65, s2
+  ; GCN9_4-NEXT:    v_perm_b32 v65, v67, v65, s3
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b128 v50, v[32:35]
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_read_b128 v[32:35], v48
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr44
+  ; GCN9_4-NEXT:    v_add_u32_e32 v86, v44, v49
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr86
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[40:43], v48 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+  ; GCN9_4-NEXT:    ds_read_b128 v[32:35], v51
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+  ; GCN9_4-NEXT:    ds_read_b128 v[40:43], v51 offset:512
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr32
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr33
+  ; GCN9_4-NEXT:    v_add_u32_e32 v32, s20, v32
+  ; GCN9_4-NEXT:    v_and_b32_e32 v32, 0x1fffffff, v32
+  ; GCN9_4-NEXT:    v_mul_lo_u32 v32, v32, s5
+  ; GCN9_4-NEXT:    v_add_lshl_u32 v81, v33, v32, 1
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b32 v81, v80
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr36
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v82, v36, 1, v81
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v82, v64
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr37
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v83, v37, 1, v82
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr40
+  ; GCN9_4-NEXT:    v_lshl_add_u32 v84, v40, 1, v83
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr41
+  ; GCN9_4-NEXT:    v_add_u32_e32 v85, v41, v49
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v83, v66
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v84, v65
+  ; GCN9_4-NEXT:    ; kill: killed $vgpr85
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+  ; GCN9_4-NEXT:    s_nop 7
+  ; GCN9_4-NEXT:    s_nop 7
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v34, s4, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v35, s4, v17
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v45, s4, v18
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v46, s4, v19
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, s6, v35
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v47, s4, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v48, s4, v21
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v45, v46
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v50, s4, v22
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v51, s4, v23
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v47, v48
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v52, s4, v24
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v53, s4, v25
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v50, v51
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v54, s4, v26
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v55, s4, v27
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v52, v53
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v56, s4, v28
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v57, s4, v29
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v54, v55
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v58, s4, v30
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v59, s4, v31
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v56, v57
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v35, s4, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v38, s4, v1
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v58, v59
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v39, s4, v2
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v42, s4, v3
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v35, v38
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v43, s4, v4
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v45, s4, v5
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v39, v42
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v46, s4, v6
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v47, s4, v7
+  ; GCN9_4-NEXT:    v_max3_f32 v34, v34, v43, v45
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v70, s4, v8
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v71, s4, v9
+  ; GCN9_4-NEXT:    v_max3_f32 v78, v34, v46, v47
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v72, s4, v10
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v73, s4, v11
+  ; GCN9_4-NEXT:    v_max3_f32 v70, v78, v70, v71
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v74, s4, v12
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v75, s4, v13
+  ; GCN9_4-NEXT:    v_max3_f32 v70, v70, v72, v73
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v76, s4, v14
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v77, s4, v15
+  ; GCN9_4-NEXT:    v_max3_f32 v70, v70, v74, v75
+  ; GCN9_4-NEXT:    v_max3_f32 v70, v70, v76, v77
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v71, v68, v70
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    v_max_f32_e32 v64, v71, v71
+  ; GCN9_4-NEXT:    v_max_f32_e32 v70, v70, v64
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v71, v68, v70
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[64:65], v85, s[16:19], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    buffer_load_dwordx2 v[66:67], v86, s[16:19], 0 offen sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+  ; GCN9_4-NEXT:    v_max_f32_e32 v70, v70, v70
+  ; GCN9_4-NEXT:    v_max_f32_e32 v72, v79, v70
+  ; GCN9_4-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v18, s4, v18, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+  ; GCN9_4-NEXT:    v_fma_f32 v17, s4, v17, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v20, s4, v20, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v21, s4, v21, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v22, s4, v22, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v23, s4, v23, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v74, v18
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v76, v20
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v16, v73
+  ; GCN9_4-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v77, v21
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v21, v74
+  ; GCN9_4-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v78, v22
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v22, v75
+  ; GCN9_4-NEXT:    v_fma_f32 v26, s4, v26, -v72
+  ; GCN9_4-NEXT:    v_sub_f32_e32 v24, v69, v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v69, v23
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+  ; GCN9_4-NEXT:    v_fma_f32 v27, s4, v27, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v85, v23
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v17, v17
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v71, v21, v22
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr79
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN9_4-NEXT:    v_fma_f32 v28, s4, v28, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v29, s4, v29, -v72
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN9_4-NEXT:    ds_read_b128 v[18:21], v79
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v16, v24
+  ; GCN9_4-NEXT:    s_nop 0
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v18, 0, v73
+  ; GCN9_4-NEXT:    v_fma_f32 v31, s4, v31, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v73, v22
+  ; GCN9_4-NEXT:    ds_read_b128 v[22:25], v79 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v19, v76
+  ; GCN9_4-NEXT:    v_fma_f32 v0, s4, v0, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v17, v18
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v80, v77
+  ; GCN9_4-NEXT:    v_fma_f32 v23, s4, v30, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v26, v18
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v27
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v22, v78
+  ; GCN9_4-NEXT:    v_fma_f32 v1, s4, v1, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v27, v18
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v74, v17
+  ; GCN9_4-NEXT:    v_fma_f32 v4, s4, v4, -v72
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v30, v69
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v18, v19, v80
+  ; GCN9_4-NEXT:    v_fma_f32 v5, s4, v5, -v72
+  ; GCN9_4-NEXT:    ; implicit-def: $vgpr70
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v19, v22, v30
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v28
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v75, v17
+  ; GCN9_4-NEXT:    v_fma_f32 v6, s4, v6, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v28, v20
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v29
+  ; GCN9_4-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v30, v20
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v23
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v76, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v22, v73
+  ; GCN9_4-NEXT:    v_fma_f32 v24, s4, v2, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v25, v18
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v29, v85
+  ; GCN9_4-NEXT:    ds_read_b128 v[18:21], v70
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v22, v22, v29
+  ; GCN9_4-NEXT:    v_fma_f32 v29, s4, v3, -v72
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v77, v17
+  ; GCN9_4-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v71, v30
+  ; GCN9_4-NEXT:    v_add_f32_e32 v17, v78, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v23, v26
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v2, v27
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v23, v23, v2
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v31
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v31, v2
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[22:23], v[48:63]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v18, v28
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v19, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v1
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v74, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v70 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[22:23], v[32:47]
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v25
+  ; GCN9_4-NEXT:    v_fma_f32 v24, s4, v9, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v22, v0
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v31
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v0, v18, v71
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v29
+  ; GCN9_4-NEXT:    s_nop 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[0:1], v[48:63]
+  ; GCN9_4-NEXT:    v_fma_f32 v21, s4, v8, -v72
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v18, v18
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v20, v19
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v23, v4
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v5
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v5, v74
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v29, v4
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47]
+  ; GCN9_4-NEXT:    v_perm_b32 v0, v66, v64, s2
+  ; GCN9_4-NEXT:    v_perm_b32 v1, v66, v64, s3
+  ; GCN9_4-NEXT:    v_perm_b32 v2, v67, v65, s2
+  ; GCN9_4-NEXT:    v_perm_b32 v3, v67, v65, s3
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    ds_write_b32 v81, v0
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v82, v1
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v83, v2
+  ; GCN9_4-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_write_b32 v84, v3
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v6
+  ; GCN9_4-NEXT:    ;;#ASMSTART
+  ; GCN9_4-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN9_4-NEXT:    ;;#ASMEND
+  ; GCN9_4-NEXT:    v_add_f32_e32 v4, v69, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v6, v22
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v17, v0
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v79
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v8, v18
+  ; GCN9_4-NEXT:    ; implicit-def: $sgpr2
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v9, v6, v8
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v8, v20, v5
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v5, 0x3fb8aa3b, v7
+  ; GCN9_4-NEXT:    s_nop 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v73, v4
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v20, v5
+  ; GCN9_4-NEXT:    ds_read_b128 v[4:7], v79 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v21
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v4, v85, v0
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v8, v0
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v64, v1
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v24
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v24, v29
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v65, v1
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v21, v23
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v17
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v20
+  ; GCN9_4-NEXT:    v_fma_f32 v9, s4, v15, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v9, v9
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v0, v21, v24
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v26, v4
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v4, v64
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v27, v2
+  ; GCN9_4-NEXT:    v_fma_f32 v11, s4, v11, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v5, s4, v14, -v72
+  ; GCN9_4-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v10, 0x3fb8aa3b, v11
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v11, v3
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN9_4-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v5
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v10, v10
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v13, v3
+  ; GCN9_4-NEXT:    v_exp_f32_e32 v7, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v70
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v12, v65
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v5, v8
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v14, v10
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v4, v4, v12
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v12, v13
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v5, v5, v14
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v14, v11
+  ; GCN9_4-NEXT:    s_nop 0
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v0, v9
+  ; GCN9_4-NEXT:    v_cvt_f16_f32_e32 v1, v7
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN9_4-NEXT:    v_pack_b32_f16 v0, v14, v12
+  ; GCN9_4-NEXT:    s_nop 1
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v28, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v30, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v25, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v31, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v19, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v74, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v22, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v18, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v23, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v29, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v17, v0
+  ; GCN9_4-NEXT:    v_add_f32_e32 v6, v20, v0
+  ; GCN9_4-NEXT:    ds_read_b128 v[0:3], v70 offset:576
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    buffer_inv sc0 sc1
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v64, v6
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v65, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v8, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v10, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v11, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v13, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v7, v2
+  ; GCN9_4-NEXT:    v_add_f32_e32 v2, v9, v2
+  ; GCN9_4-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[4:5], v[32:47]
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v0, v68, v2
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    v_add_f32_e32 v0, v2, v0
+  ; GCN9_4-NEXT:    ds_bpermute_b32 v1, v68, v0
+  ; GCN9_4-NEXT:    v_mov_b32_e32 v2, 0
+  ; GCN9_4-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN9_4-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[0:1]
+  ; GCN9_4-NEXT:    v_fmac_f32_e32 v0, v2, v16
+  ; GCN9_4-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
   !0 = !{i64 2862105}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
index a3d789c1ccc36f..daec7e9b91e71e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
 
 ; DPP control value 337 is valid for 64-bit DPP on gfx942
 
diff --git a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
index 5201f188afd5f8..df717d0ae497d3 100644
--- a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 %s -o - | FileCheck %s
 
 define void @no_corresponding_integer_type(i8 %arg, ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: no_corresponding_integer_type:
diff --git a/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s
new file mode 100644
index 00000000000000..1200fcb57969b0
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s
@@ -0,0 +1,56 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8 v1, 3
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_e64 v5, v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_sdwa v5, v1 src0_sel:BYTE_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8 v1, 3
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_e64 v5, v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_sdwa v5, v1 src0_sel:BYTE_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8_sdwa v[10:11], v1 src0_sel:WORD_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8_sdwa v[10:11], v1 src0_sel:WORD_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 37234dba7d9b4c..9c79ea588f6247 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -250,6 +250,10 @@
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_GENERIC %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_GENERIC %s
 
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX9_4_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX9_4_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_4_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_4_GENERIC %s
+
 # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX10_1_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX10_1_GENERIC
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX10_1_GENERIC %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX10_1_GENERIC %s
@@ -473,6 +477,9 @@
 # ELF-AMDGCN-GFX9_GENERIC:   EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
 # YAML-AMDGCN-GFX9_GENERIC:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC ]
 
+# ELF-AMDGCN-GFX9_4_GENERIC:   EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F)
+# YAML-AMDGCN-GFX9_4_GENERIC:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC ]
+
 # ELF-AMDGCN-GFX10_1_GENERIC:   EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
 # YAML-AMDGCN-GFX10_1_GENERIC:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC ]
 
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
index c38f6b4e7833cd..45071ecb751321 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -137,6 +137,12 @@ define amdgpu_kernel void @test_kernel() {
 
 ; ----------------------------------GFX9---------------------------------------
 ;
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-generic -filetype=obj -O0 -o %t.o %s
 ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-generic %t.o > %t-specify.txt
 ; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
index 78acbd657b7635..34c22dca3aa183 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -364,6 +364,9 @@
 # RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC -DFLAG_VALUE=0x51
 
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC -DFLAG_VALUE=0x5F
+
 # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100 -DFLAG_VALUE=0x41
 
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 7d92a492d8b181..1012cd020d525e 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1641,6 +1641,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"),                          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"),                \
+  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, "gfx9-4-generic"),            \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, "gfx10-1-generic"),          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, "gfx10-3-generic"),          \
   ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, "gfx11-generic"),              \



More information about the llvm-commits mailing list