[llvm-branch-commits] [clang] [llvm] [AMDGPU] Introduce a new generic target `gfx9-4-generic` (PR #115190)
Shilei Tian via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 7 09:17:20 PST 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/115190
From ca91f78927247f982bd55414fd83129d9c2babee Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 6 Nov 2024 12:49:45 -0500
Subject: [PATCH] [AMDGPU] Introduce a new generic target `gfx9-4-generic`
---
clang/include/clang/Basic/Cuda.h | 1 +
clang/lib/Basic/Cuda.cpp | 1 +
clang/lib/Basic/Targets/NVPTX.cpp | 1 +
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 +
clang/test/CodeGenOpenCL/amdgpu-features.cl | 3 +
clang/test/Driver/amdgpu-macros.cl | 1 +
clang/test/Driver/amdgpu-mcpu.cl | 2 +
.../Misc/target-invalid-cpu-note/amdgcn.c | 1 +
.../test/Misc/target-invalid-cpu-note/nvptx.c | 1 +
llvm/docs/AMDGPUUsage.rst | 6 +
llvm/include/llvm/BinaryFormat/ELF.h | 3 +-
llvm/include/llvm/TargetParser/TargetParser.h | 3 +-
llvm/lib/Object/ELFObjectFile.cpp | 2 +
llvm/lib/ObjectYAML/ELFYAML.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPU.td | 31 +-
llvm/lib/Target/AMDGPU/GCNProcessors.td | 5 +
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 5 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 +
llvm/lib/TargetParser/TargetParser.cpp | 9 +-
.../CodeGen/AMDGPU/directive-amdgcn-target.ll | 4 +
.../CodeGen/AMDGPU/div-rem-by-constant-64.ll | 443 +++++++
llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 1 +
.../CodeGen/AMDGPU/elf-header-flags-mach.ll | 2 +
.../AMDGPU/generic-targets-require-v6.ll | 3 +
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1144 +++++++++++++++++
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 496 +++++++
.../AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll | 1 +
.../AMDGPU/no-corresponding-integer-type.ll | 1 +
.../MC/AMDGPU/gfx9_4_generic_unsupported.s | 104 ++
.../Object/AMDGPU/elf-header-flags-mach.yaml | 7 +
.../llvm-objdump/ELF/AMDGPU/subtarget.ll | 6 +
.../llvm-readobj/ELF/AMDGPU/elf-headers.test | 3 +
llvm/tools/llvm-readobj/ELFDumper.cpp | 1 +
33 files changed, 2284 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 7b4f435dc39f29..721e8981af6ffc 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -103,6 +103,7 @@ enum class OffloadArch {
GFX909,
GFX90a,
GFX90c,
+ GFX9_4_GENERIC,
GFX940,
GFX941,
GFX942,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index d765baef913e2f..59c932468cd891 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -121,6 +121,7 @@ static const OffloadArchToStringMap arch_names[] = {
GFX(909), // gfx909
GFX(90a), // gfx90a
GFX(90c), // gfx90c
+ {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"},
GFX(940), // gfx940
GFX(941), // gfx941
GFX(942), // gfx942
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index e0bd0b096324d8..0897032c4b8546 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -205,6 +205,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case OffloadArch::GFX909:
case OffloadArch::GFX90a:
case OffloadArch::GFX90c:
+ case OffloadArch::GFX9_4_GENERIC:
case OffloadArch::GFX940:
case OffloadArch::GFX941:
case OffloadArch::GFX942:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 598b946ad88dbb..43dc0e62284602 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2301,6 +2301,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
case OffloadArch::GFX909:
case OffloadArch::GFX90a:
case OffloadArch::GFX90c:
+ case OffloadArch::GFX9_4_GENERIC:
case OffloadArch::GFX940:
case OffloadArch::GFX941:
case OffloadArch::GFX942:
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index f3473346baae5a..692f5103724342 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -56,6 +56,8 @@
// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1103 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103-W64 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck --check-prefix=GFX9_4_Generic %s
+
// NOCPU-NOT: "target-features"
// NOCPU-WAVE32: "target-features"="+wavefrontsize32"
// NOCPU-WAVE64: "target-features"="+wavefrontsize64"
@@ -85,6 +87,7 @@
// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl
index 2fedd10bb53445..92abed03a5cebd 100644
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -133,6 +133,7 @@
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_generic -DFAMILY=GFX9
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx9_4_generic -DFAMILY=GFX9
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_1_generic -DFAMILY=GFX10
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx10_3_generic -DFAMILY=GFX10
// RUN: %clang -E -dM -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx11_generic -DFAMILY=GFX11
diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl
index 42ce33db6eec07..ba578435072985 100644
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -118,6 +118,7 @@
// RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s
// RUN: %clang -### -target amdgcn -mcpu=gfx9-generic %s 2>&1 | FileCheck --check-prefix=GFX9_GENERIC %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx9-4-generic %s 2>&1 | FileCheck --check-prefix=GFX9_4_GENERIC %s
// RUN: %clang -### -target amdgcn -mcpu=gfx10-1-generic %s 2>&1 | FileCheck --check-prefix=GFX10_1_GENERIC %s
// RUN: %clang -### -target amdgcn -mcpu=gfx10-3-generic %s 2>&1 | FileCheck --check-prefix=GFX10_3_GENERIC %s
// RUN: %clang -### -target amdgcn -mcpu=gfx11-generic %s 2>&1 | FileCheck --check-prefix=GFX11_GENERIC %s
@@ -172,6 +173,7 @@
// GFX1201: "-target-cpu" "gfx1201"
// GFX9_GENERIC: "-target-cpu" "gfx9-generic"
+// GFX9_4_GENERIC: "-target-cpu" "gfx9-4-generic"
// GFX10_1_GENERIC: "-target-cpu" "gfx10-1-generic"
// GFX10_3_GENERIC: "-target-cpu" "gfx10-3-generic"
// GFX11_GENERIC: "-target-cpu" "gfx11-generic"
diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
index b3ddbd53a0391b..cae31797c1febe 100644
--- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
+++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
@@ -70,6 +70,7 @@
// CHECK-SAME: {{^}}, gfx1200
// CHECK-SAME: {{^}}, gfx1201
// CHECK-SAME: {{^}}, gfx9-generic
+// CHECK-SAME: {{^}}, gfx9-4-generic
// CHECK-SAME: {{^}}, gfx10-1-generic
// CHECK-SAME: {{^}}, gfx10-3-generic
// CHECK-SAME: {{^}}, gfx11-generic
diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
index a59e1c6fab1c49..44fe07065b2428 100644
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -50,6 +50,7 @@
// CHECK-SAME: {{^}}, gfx909
// CHECK-SAME: {{^}}, gfx90a
// CHECK-SAME: {{^}}, gfx90c
+// CHECK-SAME: {{^}}, gfx9-4-generic
// CHECK-SAME: {{^}}, gfx940
// CHECK-SAME: {{^}}, gfx941
// CHECK-SAME: {{^}}, gfx942
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5b83ea428c0bff..b9eb71e73dad97 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -576,6 +576,12 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor
- ``v_dot2_f32_f16``
+ ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - xnack - Absolute flat FP8 and BF8 instructions,
+ - ``gfx941`` scratch FP8 and BF8 conversion instructions,
+ - ``gfx942`` as well as instructions with XF32 format support
+ are not available.
+
+
``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are
- ``gfx1011`` - wavefrontsize64 scratch not available on ``gfx1011``
- ``gfx1012`` - cumode and ``gfx1012``
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index c591a96232f115..6c05ea7208e1f1 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -822,11 +822,12 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
EF_AMDGPU_MACH_AMDGCN_GFX1153 = 0x058,
EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059,
+ EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC = 0x05f,
// clang-format on
// First/last AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
- EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index ae86ff39083d89..c6db4dfd7f5159 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -119,9 +119,10 @@ enum GPUKind : uint32_t {
GK_GFX10_3_GENERIC = 194,
GK_GFX11_GENERIC = 195,
GK_GFX12_GENERIC = 196,
+ GK_GFX9_4_GENERIC = 197,
GK_AMDGCN_GENERIC_FIRST = GK_GFX9_GENERIC,
- GK_AMDGCN_GENERIC_LAST = GK_GFX12_GENERIC,
+ GK_AMDGCN_GENERIC_LAST = GK_GFX9_4_GENERIC,
};
/// Instruction set architecture version.
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 5096877d2a4b00..9dc39936ffd8bb 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -602,6 +602,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
// Generic AMDGCN targets
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC:
return "gfx9-generic";
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC:
+ return "gfx9-4-generic";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC:
return "gfx10-1-generic";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC:
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e97248cbcf5682..76c6c496c5e6bc 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -631,6 +631,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, EF_AMDGPU_MACH);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index bde61a1f7e58df..c8ae010414dc40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1451,11 +1451,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureDPALU_DPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
- FeatureFP8Insts,
- FeatureFP8ConversionInsts,
- FeatureCvtFP8VOP1Bug,
FeaturePkFmacF16Inst,
- FeatureXF32Insts,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
FeatureAtomicBufferGlobalPkAddF16Insts,
@@ -1476,15 +1472,36 @@ def FeatureISAVersion9_4_Common : FeatureSet<
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
- [FeatureForceStoreSC0SC1])>;
+ [
+ FeatureForceStoreSC0SC1,
+ FeatureFP8Insts,
+ FeatureFP8ConversionInsts,
+ FeatureCvtFP8VOP1Bug,
+ FeatureXF32Insts
+ ])>;
def FeatureISAVersion9_4_1 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
- [FeatureForceStoreSC0SC1])>;
+ [
+ FeatureForceStoreSC0SC1,
+ FeatureFP8Insts,
+ FeatureFP8ConversionInsts,
+ FeatureCvtFP8VOP1Bug,
+ FeatureXF32Insts
+ ])>;
def FeatureISAVersion9_4_2 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
- [])>;
+ [
+ FeatureFP8Insts,
+ FeatureFP8ConversionInsts,
+ FeatureCvtFP8VOP1Bug,
+ FeatureXF32Insts
+ ])>;
+
+def FeatureISAVersion9_4_Generic : FeatureSet<
+ !listconcat(FeatureISAVersion9_4_Common.Features,
+ [FeatureRequiresCOV6])>;
def FeatureISAVersion10_Common : FeatureSet<
[FeatureGFX10,
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 547941633fda61..e12a6127b17063 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -209,6 +209,11 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
FeatureISAVersion9_Generic.Features
>;
+// [gfx940, gfx941, gfx942]
+def : ProcessorModel<"gfx9-4-generic", SIQuarterSpeedModel,
+ FeatureISAVersion9_4_Generic.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5c625c3d83ff1b..507725b91a9bee 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -119,6 +119,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC: AK = GK_GFX9_4_GENERIC; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: AK = GK_GFX10_1_GENERIC; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: AK = GK_GFX10_3_GENERIC; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: AK = GK_GFX11_GENERIC; break;
@@ -204,6 +205,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC;
+ case GK_GFX9_4_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC;
case GK_GFX10_1_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC;
case GK_GFX10_3_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC;
case GK_GFX11_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC;
@@ -821,6 +823,9 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() {
case AMDGPU::GK_GFX9_GENERIC:
Version = GenericVersion::GFX9;
break;
+ case AMDGPU::GK_GFX9_4_GENERIC:
+ Version = GenericVersion::GFX9_4;
+ break;
case AMDGPU::GK_GFX10_1_GENERIC:
Version = GenericVersion::GFX10_1;
break;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index beebe320b2cf3a..88a6d75b72c7d0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -46,6 +46,7 @@ struct IsaVersion;
/// within a generic family.
namespace GenericVersion {
static constexpr unsigned GFX9 = 1;
+static constexpr unsigned GFX9_4 = 1;
static constexpr unsigned GFX10_1 = 1;
static constexpr unsigned GFX10_3 = 1;
static constexpr unsigned GFX11 = 1;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 059d7b4f5ff2d0..f1a1d78e801e8e 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -130,6 +130,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx9-generic"}, {"gfx9-generic"}, GK_GFX9_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK},
+ {{"gfx9-4-generic"}, {"gfx9-4-generic"}, GK_GFX9_4_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx10-1-generic"}, {"gfx10-1-generic"}, GK_GFX10_1_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
{{"gfx10-3-generic"}, {"gfx10-3-generic"}, GK_GFX10_3_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
{{"gfx11-generic"}, {"gfx11-generic"}, GK_GFX11_GENERIC, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP},
@@ -156,6 +157,8 @@ StringRef llvm::AMDGPU::getArchFamilyNameAMDGCN(GPUKind AK) {
switch (AK) {
case AMDGPU::GK_GFX9_GENERIC:
return "gfx9";
+ case AMDGPU::GK_GFX9_4_GENERIC:
+ return "gfx9";
case AMDGPU::GK_GFX10_1_GENERIC:
case AMDGPU::GK_GFX10_3_GENERIC:
return "gfx10";
@@ -296,6 +299,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
// TODO: Split up this API depending on its caller so
// generic target handling is more obvious and less risky.
case GK_GFX9_GENERIC: return {9, 0, 0};
+ case GK_GFX9_4_GENERIC: return {9, 4, 0};
case GK_GFX10_1_GENERIC: return {10, 1, 0};
case GK_GFX10_3_GENERIC: return {10, 3, 0};
case GK_GFX11_GENERIC: return {11, 0, 3};
@@ -466,9 +470,12 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
case GK_GFX942:
case GK_GFX941:
case GK_GFX940:
- Features["gfx940-insts"] = true;
Features["fp8-insts"] = true;
Features["fp8-conversion-insts"] = true;
+ Features["xf32-insts"] = true;
+ [[fallthrough]];
+ case GK_GFX9_4_GENERIC:
+ Features["gfx940-insts"] = true;
Features["atomic-ds-pk-add-16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
Features["atomic-global-pk-add-bf16-inst"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
index 5986d2d38ef1ad..4eac26e853c2a0 100644
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -112,6 +112,8 @@
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_NOXNACK %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_GENERIC_XNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_NOXNACK %s
+; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX9_4_GENERIC_XNACK %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=-xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_NOXNACK %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-1-generic -mattr=+xnack < %s | FileCheck --check-prefixes=GFX10_1_GENERIC_XNACK %s
; RUN: llc --amdhsa-code-object-version=6 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx10-3-generic < %s | FileCheck --check-prefixes=GFX10_3_GENERIC %s
@@ -210,6 +212,8 @@
; GFX9_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack-"
; GFX9_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-generic:xnack+"
+; GFX9_4_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack-"
+; GFX9_4_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx9-4-generic:xnack+"
; GFX10_1_GENERIC_NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack-"
; GFX10_1_GENERIC_XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-1-generic:xnack+"
; GFX10_3_GENERIC: .amdgcn_target "amdgcn-amd-amdhsa--gfx10-3-generic"
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 662de47413654f..10ef4dc280d091 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s | FileCheck -check-prefixes=GFX9_4 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - %s | FileCheck -check-prefixes=GFX1030 %s
; Sample test to check how we deal with division/modulos by 64 bit constants.
@@ -66,6 +67,37 @@ define noundef i64 @srem64_3(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_3:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_mul_lo_u32 v6, v2, s3
+; GFX9_4-NEXT: v_mul_lo_u32 v7, v2, s2
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v5, v7, v5, v6
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_3:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -158,6 +190,37 @@ define noundef i64 @srem64_6(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_6:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_mul_lo_u32 v6, v2, s3
+; GFX9_4-NEXT: v_mul_lo_u32 v7, v2, s2
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v5, v7, v5, v6
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_6:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -240,6 +303,32 @@ define noundef i64 @urem64_3(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_3:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 1
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_3:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,6 +406,32 @@ define noundef i64 @urem64_6(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_6:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 2
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 6, 0
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_6:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -393,6 +508,30 @@ define noundef i64 @sdiv64_3(i64 noundef %i) {
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_3:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v0, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT: v_mul_lo_u32 v2, v0, s3
+; GFX9_4-NEXT: v_mul_lo_u32 v6, v0, s2
+; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v1, v6, v1, v2
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_3:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -470,6 +609,30 @@ define noundef i64 @sdiv64_6(i64 noundef %i) {
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_6:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s2, 0x55555556
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s3, 0x55555555
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s3, v[4:5]
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v0, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s3, v[4:5]
+; GFX9_4-NEXT: v_mul_lo_u32 v2, v0, s3
+; GFX9_4-NEXT: v_mul_lo_u32 v6, v0, s2
+; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v1, v6, v1, v2
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_6:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -536,6 +699,25 @@ define noundef i64 @udiv64_3(i64 noundef %i) {
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_3:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_3:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -596,6 +778,25 @@ define noundef i64 @udiv64_6(i64 noundef %i) {
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 2, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_6:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: s_mov_b32 s0, 0xaaaaaaab
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, s0
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: s_mov_b32 s2, 0xaaaaaaaa
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 2
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_6:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -641,6 +842,18 @@ define noundef i64 @srem64_2(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_2:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_and_b32_e32 v2, -2, v2
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_2:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -675,6 +888,15 @@ define noundef i64 @sdiv64_2(i64 noundef %i) {
; GFX942-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_2:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_2:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -703,6 +925,13 @@ define noundef i64 @urem64_2(i64 noundef %i) {
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_2:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9_4-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_2:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -729,6 +958,13 @@ define noundef i64 @udiv64_2(i64 noundef %i) {
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_2:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_2:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -766,6 +1002,19 @@ define noundef i64 @srem64_64(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_64:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 26, v2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_and_b32_e32 v2, 0xffffffc0, v2
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_64:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -803,6 +1052,16 @@ define noundef i64 @sdiv64_64(i64 noundef %i) {
; GFX942-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_64:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 26, v2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_64:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -832,6 +1091,13 @@ define noundef i64 @urem64_64(i64 noundef %i) {
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_64:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9_4-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_64:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,6 +1124,13 @@ define noundef i64 @udiv64_64(i64 noundef %i) {
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 6, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_64:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 6
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 6, v1
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_64:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -895,6 +1168,19 @@ define noundef i64 @srem64_i32min(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_i32min:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_and_b32_e32 v2, 0x80000000, v2
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_i32min:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -937,6 +1223,19 @@ define noundef i64 @sdiv64_i32min(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_i32min:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v0
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_i32min:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -979,6 +1278,18 @@ define noundef i64 @urem64_i32min(i64 noundef %i) {
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_i32min:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_alignbit_b32 v4, v1, v0, 31
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, 1, v[2:3]
+; GFX9_4-NEXT: v_add_lshl_u32 v2, v2, v5, 30
+; GFX9_4-NEXT: v_and_b32_e32 v2, 0x80000000, v2
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_i32min:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1025,6 +1336,20 @@ define noundef i64 @udiv64_i32min(i64 noundef %i) {
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_i32min:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_alignbit_b32 v2, v1, v0, 31
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GFX9_4-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 1, v[0:1]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v3
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v1
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 1
+; GFX9_4-NEXT: v_mov_b32_e32 v1, 0
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_i32min:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1117,6 +1442,43 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: srem64_i32max:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 3
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX9_4-NEXT: v_lshl_add_u32 v8, v6, 31, v6
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3]
+; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: s_mov_b32 s2, 0x80000001
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v7, v7, v8, v6
+; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7]
+; GFX9_4-NEXT: v_mov_b32_e32 v4, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5]
+; GFX9_4-NEXT: v_sub_u32_e32 v2, v7, v1
+; GFX9_4-NEXT: v_sub_u32_e32 v7, v2, v0
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5]
+; GFX9_4-NEXT: s_brev_b32 s2, -2
+; GFX9_4-NEXT: v_add_u32_e32 v5, v1, v5
+; GFX9_4-NEXT: v_ashrrev_i64 v[6:7], 30, v[4:5]
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[6:7], 0, v[2:3]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: srem64_i32max:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1216,6 +1578,35 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: sdiv64_i32max:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 3
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX9_4-NEXT: v_lshl_add_u32 v8, v6, 31, v6
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 3, v[2:3]
+; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v6, 3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: s_mov_b32 s2, 0x80000001
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX9_4-NEXT: v_add3_u32 v7, v7, v8, v6
+; GFX9_4-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, -1, v[6:7]
+; GFX9_4-NEXT: v_mov_b32_e32 v4, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, s2, v[4:5]
+; GFX9_4-NEXT: v_sub_u32_e32 v2, v7, v1
+; GFX9_4-NEXT: v_sub_u32_e32 v7, v2, v0
+; GFX9_4-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 1, v[4:5]
+; GFX9_4-NEXT: v_add_u32_e32 v5, v1, v5
+; GFX9_4-NEXT: v_ashrrev_i64 v[0:1], 30, v[4:5]
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: sdiv64_i32max:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1308,6 +1699,36 @@ define noundef i64 @urem64_i32max(i64 noundef %i) {
; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: urem64_i32max:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 5
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: s_brev_b32 s2, -2
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9_4-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v2, v3, v2, 30
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v3, 30, v3
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9_4-NEXT: v_mov_b32_e32 v3, v2
+; GFX9_4-NEXT: s_nop 0
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: urem64_i32max:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1383,6 +1804,28 @@ define noundef i64 @udiv64_i32max(i64 noundef %i) {
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
+; GFX9_4-LABEL: udiv64_i32max:
+; GFX9_4: ; %bb.0: ; %entry
+; GFX9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_4-NEXT: v_mul_hi_u32 v2, v0, 5
+; GFX9_4-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_4-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, 5, v[2:3]
+; GFX9_4-NEXT: v_mov_b32_e32 v2, v5
+; GFX9_4-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, 2, v[4:5]
+; GFX9_4-NEXT: v_mov_b32_e32 v6, v5
+; GFX9_4-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX9_4-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, 2, v[2:3]
+; GFX9_4-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9_4-NEXT: s_nop 1
+; GFX9_4-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_4-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX9_4-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX9_4-NEXT: v_alignbit_b32 v0, v1, v0, 30
+; GFX9_4-NEXT: v_lshrrev_b32_e32 v1, 30, v1
+; GFX9_4-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1030-LABEL: udiv64_i32max:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index e24817078d8bc9..524e074bb69de4 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=HAZARD %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=NOHAZARD %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index f293c52bf6bfb2..f1f4edb94a6178 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -80,6 +80,7 @@
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s
; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_GENERIC %s
+; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx9-4-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX9_4_GENERIC %s
; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-1-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_1_GENERIC %s
; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx10-3-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX10_3_GENERIC %s
; RUN: llc -filetype=obj --amdhsa-code-object-version=6 -mtriple=amdgcn -mcpu=gfx11-generic < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX11_GENERIC %s
@@ -161,6 +162,7 @@
; GFX1201: EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E)
; GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
+; GFX9_4_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F)
; GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
; GFX10_3_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC (0x53)
; GFX11_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC (0x54)
diff --git a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
index d30cf1531a06b1..d58dc5db8c80f1 100644
--- a/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
+++ b/llvm/test/CodeGen/AMDGPU/generic-targets-require-v6.ll
@@ -1,16 +1,19 @@
; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-V5 %s
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX9-4-V5 %s
; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX101-V5 %s
; RUN: not llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX103-V5 %s
; RUN: not llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX11-V5 %s
; RUN: not llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=5 -o - %s 2>&1 | FileCheck --check-prefix=GFX12-V5 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -o - %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -o - %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx10-1-generic --amdhsa-code-object-version=6 -o - %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx10-3-generic --amdhsa-code-object-version=6 -o - %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx12-generic --amdhsa-code-object-version=6 -o - %s
; GFX9-V5: gfx9-generic is only available on code object version 6 or better
+; GFX9-4-V5: gfx9-4-generic is only available on code object version 6 or better
; GFX101-V5: gfx10-1-generic is only available on code object version 6 or better
; GFX103-V5: gfx10-3-generic is only available on code object version 6 or better
; GFX11-V5: gfx11-generic is only available on code object version 6 or better
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index da1d9972e42dcf..c3f1d47f35ed28 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s
--- |
define amdgpu_kernel void @largeInterleave() #0 { ret void }
@@ -1145,6 +1146,1149 @@
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
+ ;
+ ; GCN9_4-LABEL: largeInterleave:
+ ; GCN9_4: ; %bb.0:
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr0
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr8
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr94
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr132
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr133
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr139
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_readfirstlane_b32 s7, v0
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN9_4-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr5
+ ; GCN9_4-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: v_lshl_add_u32 v0, s7, 4, v1
+ ; GCN9_4-NEXT: v_mul_lo_u32 v0, v0, s6
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr1
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr6
+ ; GCN9_4-NEXT: v_add_lshl_u32 v92, v0, v1, 1
+ ; GCN9_4-NEXT: v_add_u32_e32 v93, s0, v92
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: s_lshl_b32 s0, s7, 7
+ ; GCN9_4-NEXT: v_add_lshl_u32 v95, v8, s0, 1
+ ; GCN9_4-NEXT: v_add_u32_e32 v8, 64, v93
+ ; GCN9_4-NEXT: ; kill: killed $vgpr8
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN9_4-NEXT: ; kill: killed $vgpr92
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[0:3]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[4:7] offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[84:87], v94 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
+ ; GCN9_4-NEXT: ds_read_b128 v[88:91], v94 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr88
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN9_4-NEXT: v_add_u32_e32 v72, 0x80, v93
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ; kill: killed $vgpr72
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[64:67]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[68:71] offset:1024
+ ; GCN9_4-NEXT: v_add_u32_e32 v66, 0xc0, v93
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr67
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr70
+ ; GCN9_4-NEXT: v_add_u32_e32 v68, v132, v64
+ ; GCN9_4-NEXT: ; kill: killed $vgpr66
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr71
+ ; GCN9_4-NEXT: ; kill: killed $vgpr68
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[72:75], v92, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[76:79], v66, s[8:11], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v66, v132, v67
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr8
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[68:69], v66, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v66, v132, v70
+ ; GCN9_4-NEXT: v_add_u32_e32 v70, v132, v71
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], v66, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[70:71], v70, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[86:87], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v94 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[84:85], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[84:85], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[86:87], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[84:85], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[86:87], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[84:85], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[86:87], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[80:83], v88 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[72:75]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b128 v95, v[76:79] offset:1024
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[84:85], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[86:87], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v94 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1024
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[72:75], v88 offset:1536
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+ ; GCN9_4-NEXT: v_perm_b32 v72, v68, v64, s5
+ ; GCN9_4-NEXT: v_perm_b32 v64, v68, v64, s8
+ ; GCN9_4-NEXT: v_perm_b32 v68, v69, v65, s5
+ ; GCN9_4-NEXT: v_perm_b32 v73, v70, v66, s5
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+ ; GCN9_4-NEXT: v_perm_b32 v74, v69, v65, s8
+ ; GCN9_4-NEXT: v_perm_b32 v65, v70, v66, s8
+ ; GCN9_4-NEXT: v_perm_b32 v69, v71, v67, s5
+ ; GCN9_4-NEXT: v_perm_b32 v75, v71, v67, s8
+ ; GCN9_4-NEXT: v_mul_f32_e32 v66, s4, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v49
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, s6, v67
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v50
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v51
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v52
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v53
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v54
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v55
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v56
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v57
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v58
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v59
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v60
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v61
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v62
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v63
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v32
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v33
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v34
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v35
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v36
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v37
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v38
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v39
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v40
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v41
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v42
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v43
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v44
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v45
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v46
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v47
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v17
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v18
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v19
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v21
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v22
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v23
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v24
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v25
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v26
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v27
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v28
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v29
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v30
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v31
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v1
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v2
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v3
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v4
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v5
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v6
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v7
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v8
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v9
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v10
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v11
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v67, s4, v12
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v13
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v67, v70
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr67
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr6
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v14
+ ; GCN9_4-NEXT: v_add_u32_e32 v67, s7, v67
+ ; GCN9_4-NEXT: v_and_b32_e32 v67, 0x1fffffff, v67
+ ; GCN9_4-NEXT: v_mul_lo_u32 v67, v67, s6
+ ; GCN9_4-NEXT: v_mul_f32_e32 v71, s4, v15
+ ; GCN9_4-NEXT: v_max3_f32 v66, v66, v70, v71
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr70
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr6_sgpr7
+ ; GCN9_4-NEXT: v_add_lshl_u32 v135, v70, v67, 1
+ ; GCN9_4-NEXT: ds_bpermute_b32 v67, v133, v66
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr70
+ ; GCN9_4-NEXT: v_lshl_add_u32 v136, v70, 1, v135
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v135, v[72:73]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v136, v[64:65]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr64
+ ; GCN9_4-NEXT: v_lshl_add_u32 v137, v64, 1, v136
+ ; GCN9_4-NEXT: v_max_f32_e32 v65, v67, v67
+ ; GCN9_4-NEXT: v_max_f32_e32 v65, v66, v65
+ ; GCN9_4-NEXT: ds_bpermute_b32 v66, v133, v65
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr64
+ ; GCN9_4-NEXT: v_lshl_add_u32 v138, v64, 1, v137
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v137, v[68:69]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr68
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v138, v[74:75]
+ ; GCN9_4-NEXT: v_add_u32_e32 v68, v132, v68
+ ; GCN9_4-NEXT: v_cndmask_b32_e64 v64, v66, v65, s[6:7]
+ ; GCN9_4-NEXT: v_max_f32_e32 v64, v64, v64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr65
+ ; GCN9_4-NEXT: v_max_f32_e32 v66, v65, v65
+ ; GCN9_4-NEXT: v_max_f32_e32 v134, v66, v64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr66
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v64
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v66
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr67
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v64, v132, v67
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v48, s4, v48, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v57, s4, v57, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+ ; GCN9_4-NEXT: v_fma_f32 v64, s4, v49, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+ ; GCN9_4-NEXT: v_fma_f32 v66, s4, v50, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v162, v57
+ ; GCN9_4-NEXT: v_exp_f32_e32 v49, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
+ ; GCN9_4-NEXT: v_fma_f32 v67, s4, v51, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v50, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66
+ ; GCN9_4-NEXT: v_fma_f32 v68, s4, v52, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v51, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67
+ ; GCN9_4-NEXT: v_fma_f32 v69, s4, v53, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v52, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_fma_f32 v70, s4, v54, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v53, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69
+ ; GCN9_4-NEXT: v_fma_f32 v71, s4, v55, -v134
+ ; GCN9_4-NEXT: ds_read_b128 v[140:143], v139
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v54, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
+ ; GCN9_4-NEXT: v_exp_f32_e32 v55, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
+ ; GCN9_4-NEXT: v_fma_f32 v66, s4, v56, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v56, v48
+ ; GCN9_4-NEXT: v_sub_f32_e32 v48, v65, v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v64, v49
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v67, v50
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v68, v51
+ ; GCN9_4-NEXT: v_fma_f32 v112, s4, v58, -v134
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v52
+ ; GCN9_4-NEXT: v_exp_f32_e32 v48, v48
+ ; GCN9_4-NEXT: ds_read_b128 v[144:147], v139 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v160, s4, v59, -v134
+ ; GCN9_4-NEXT: v_pack_b32_f16 v59, v68, v58
+ ; GCN9_4-NEXT: v_pack_b32_f16 v58, v64, v67
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: ds_read_b128 v[148:151], v139 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[152:155], v139 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[140:141], v[58:59], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v66
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v161, v64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
+ ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v112
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
+ ; GCN9_4-NEXT: v_fma_f32 v144, s4, v61, -v134
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v144
+ ; GCN9_4-NEXT: v_exp_f32_e32 v164, v144
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v140, v53
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v141, v54
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v61, v55
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+ ; GCN9_4-NEXT: v_fma_f32 v145, s4, v62, -v134
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[148:149], v[58:59], v[64:79]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v148, v57
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v57, v56
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v63, -v134
+ ; GCN9_4-NEXT: v_pack_b32_f16 v63, v61, v57
+ ; GCN9_4-NEXT: v_pack_b32_f16 v62, v140, v141
+ ; GCN9_4-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v160
+ ; GCN9_4-NEXT: v_fma_f32 v60, s4, v60, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v163, s4, v33, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v145
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v160, v161
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v144, v148
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v152, v57
+ ; GCN9_4-NEXT: v_mul_f32_e32 v153, 0x3fb8aa3b, v60
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr57
+ ; GCN9_4-NEXT: ds_read_b128 v[58:61], v57
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v32, s4, v32, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v40, s4, v40, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v44, s4, v44, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v16, s4, v16, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v24, s4, v24, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v28, s4, v28, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v0, s4, v0, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v8, s4, v8, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v12, s4, v12, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[142:143], v[62:63], v[96:111]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v153, v153
+ ; GCN9_4-NEXT: ds_read_b128 v[140:143], v57 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[62:63], v[80:95]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v146, v162
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[150:151], v[62:63], v[64:79]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v151, v33
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v152
+ ; GCN9_4-NEXT: v_fma_f32 v150, s4, v34, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[62:63], v[112:127]
+ ; GCN9_4-NEXT: v_pack_b32_f16 v63, v144, v33
+ ; GCN9_4-NEXT: v_pack_b32_f16 v62, v160, v146
+ ; GCN9_4-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v35, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v154, v33
+ ; GCN9_4-NEXT: v_fma_f32 v160, s4, v36, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v163
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v155, v153
+ ; GCN9_4-NEXT: v_fma_f32 v163, s4, v37, -v134
+ ; GCN9_4-NEXT: v_perm_b32 v37, v130, v128, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[58:59], v[62:63], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v32
+ ; GCN9_4-NEXT: ds_read_b128 v[32:35], v57 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[144:147], v57 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v165, v58
+ ; GCN9_4-NEXT: v_perm_b32 v59, v131, v129, s5
+ ; GCN9_4-NEXT: v_perm_b32 v58, v159, v157, s5
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[62:63], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v166, v36
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v141, v151
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v140, v164
+ ; GCN9_4-NEXT: v_perm_b32 v36, v158, v156, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[62:63], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+ ; GCN9_4-NEXT: v_fma_f32 v150, s4, v38, -v134
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v38, v154
+ ; GCN9_4-NEXT: v_exp_f32_e32 v167, v32
+ ; GCN9_4-NEXT: v_perm_b32 v32, v158, v156, s5
+ ; GCN9_4-NEXT: v_perm_b32 v33, v130, v128, s5
+ ; GCN9_4-NEXT: v_pack_b32_f16 v128, v155, v140
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b64 v135, v[32:33]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr33
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v136, v[36:37]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v137, v[58:59]
+ ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v33
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr36
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr37
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v160
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[62:63], v[112:127]
+ ; GCN9_4-NEXT: v_perm_b32 v63, v131, v129, s8
+ ; GCN9_4-NEXT: v_pack_b32_f16 v129, v141, v38
+ ; GCN9_4-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_perm_b32 v62, v159, v157, s8
+ ; GCN9_4-NEXT: v_exp_f32_e32 v155, v38
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr38
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v138, v[62:63]
+ ; GCN9_4-NEXT: v_add_u32_e32 v38, v132, v38
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[130:131], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v36
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_u32_e32 v33, v132, v37
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v39, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[60:61], v[128:129], v[96:111]
+ ; GCN9_4-NEXT: ds_read_b128 v[36:39], v139
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v156, v32
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v163
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v165
+ ; GCN9_4-NEXT: ds_read_b128 v[58:61], v139 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v157, v32
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v142, v166
+ ; GCN9_4-NEXT: v_fma_f32 v143, s4, v41, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v150, s4, v42, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[128:129], v[64:79]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v34, v167
+ ; GCN9_4-NEXT: v_exp_f32_e32 v158, v32
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v32, v155
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
+ ; GCN9_4-NEXT: v_pack_b32_f16 v129, v34, v32
+ ; GCN9_4-NEXT: v_pack_b32_f16 v128, v33, v142
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_exp_f32_e32 v146, v32
+ ; GCN9_4-NEXT: ds_read_b128 v[32:35], v139 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v142, s4, v43, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v46, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[128:129], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40
+ ; GCN9_4-NEXT: ds_read_b128 v[40:43], v139 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v147, v36
+ ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v37, v156
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[58:59], v[128:129], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v143, v36
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v157
+ ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142
+ ; GCN9_4-NEXT: v_fma_f32 v59, s4, v45, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[128:129], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v33, v158
+ ; GCN9_4-NEXT: v_exp_f32_e32 v150, v32
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v32, v146
+ ; GCN9_4-NEXT: v_pack_b32_f16 v33, v33, v32
+ ; GCN9_4-NEXT: v_pack_b32_f16 v32, v37, v58
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v58, v147
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v129, v36
+ ; GCN9_4-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v44
+ ; GCN9_4-NEXT: v_fma_f32 v128, s4, v47, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[32:33], v[96:111]
+ ; GCN9_4-NEXT: ds_read_b128 v[36:39], v57
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v142, v40
+ ; GCN9_4-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v59
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v59, v143
+ ; GCN9_4-NEXT: ds_read_b128 v[44:47], v57 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[32:33], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v61, v40
+ ; GCN9_4-NEXT: v_fma_f32 v60, s4, v17, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_pack_b32_f16 v40, v58, v59
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v18, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v58, s4, v20, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v60
+ ; GCN9_4-NEXT: v_fma_f32 v60, s4, v21, -v134
+ ; GCN9_4-NEXT: v_perm_b32 v21, v144, v140, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[32:33], v[64:79]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v34, v150
+ ; GCN9_4-NEXT: v_exp_f32_e32 v159, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v129
+ ; GCN9_4-NEXT: v_pack_b32_f16 v41, v34, v17
+ ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
+ ; GCN9_4-NEXT: v_fma_f32 v42, s4, v19, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v128, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v43, v142
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[36:37], v[40:41], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16
+ ; GCN9_4-NEXT: ds_read_b128 v[16:19], v57 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[32:35], v57 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v160, v36
+ ; GCN9_4-NEXT: v_perm_b32 v36, v131, v63, s8
+ ; GCN9_4-NEXT: v_perm_b32 v37, v145, v141, s8
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v163, v20
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v45, v159
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v44, v61
+ ; GCN9_4-NEXT: v_perm_b32 v20, v130, v62, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[40:41], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_fma_f32 v149, s4, v22, -v134
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v128
+ ; GCN9_4-NEXT: v_exp_f32_e32 v168, v16
+ ; GCN9_4-NEXT: v_perm_b32 v16, v130, v62, s5
+ ; GCN9_4-NEXT: v_perm_b32 v17, v144, v140, s5
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b64 v135, v[16:17]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr17
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr20
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr21
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v58
+ ; GCN9_4-NEXT: v_add_u32_e32 v17, v132, v17
+ ; GCN9_4-NEXT: v_add_u32_e32 v20, v132, v20
+ ; GCN9_4-NEXT: v_add_u32_e32 v21, v132, v21
+ ; GCN9_4-NEXT: v_fma_f32 v62, s4, v23, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
+ ; GCN9_4-NEXT: v_pack_b32_f16 v41, v45, v22
+ ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v42
+ ; GCN9_4-NEXT: v_perm_b32 v32, v131, v63, s5
+ ; GCN9_4-NEXT: v_perm_b32 v33, v145, v141, s5
+ ; GCN9_4-NEXT: v_exp_f32_e32 v63, v22
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr22
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v137, v[32:33]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v138, v[36:37]
+ ; GCN9_4-NEXT: v_add_u32_e32 v22, v132, v22
+ ; GCN9_4-NEXT: v_pack_b32_f16 v40, v43, v44
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[32:33], v22, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ds_read_b128 v[20:23], v139
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[38:39], v[40:41], v[96:111]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v130, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v60
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v160
+ ; GCN9_4-NEXT: ds_read_b128 v[36:39], v139 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v131, s4, v26, -v134
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[40:41], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v60, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v149
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v46, v163
+ ; GCN9_4-NEXT: v_fma_f32 v47, s4, v25, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[40:41], v[64:79]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v132, v16
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v168
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v63
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[40:41], v[112:127]
+ ; GCN9_4-NEXT: v_pack_b32_f16 v35, v18, v16
+ ; GCN9_4-NEXT: v_pack_b32_f16 v34, v17, v46
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62
+ ; GCN9_4-NEXT: v_exp_f32_e32 v41, v16
+ ; GCN9_4-NEXT: ds_read_b128 v[16:19], v139 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v40, s4, v27, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v62, s4, v30, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[34:35], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24
+ ; GCN9_4-NEXT: ds_read_b128 v[24:27], v139 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v46, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v130
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[34:35], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v47, v20
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v36, v60
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v40
+ ; GCN9_4-NEXT: v_fma_f32 v37, s4, v29, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[16:17], v[34:35], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+ ; GCN9_4-NEXT: v_exp_f32_e32 v131, v16
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v17, v132
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v41
+ ; GCN9_4-NEXT: v_pack_b32_f16 v17, v17, v16
+ ; GCN9_4-NEXT: v_pack_b32_f16 v16, v21, v36
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v36, v46
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[34:35], v[112:127]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v35, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28
+ ; GCN9_4-NEXT: v_fma_f32 v34, s4, v31, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[16:17], v[96:111]
+ ; GCN9_4-NEXT: ds_read_b128 v[20:23], v57
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v40, v24
+ ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v37, v47
+ ; GCN9_4-NEXT: ds_read_b128 v[28:31], v57 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
+ ; GCN9_4-NEXT: v_fma_f32 v38, s4, v1, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v39, v24
+ ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v62
+ ; GCN9_4-NEXT: v_pack_b32_f16 v24, v36, v37
+ ; GCN9_4-NEXT: v_fma_f32 v62, s4, v2, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v37, s4, v6, -v134
+ ; GCN9_4-NEXT: v_perm_b32 v6, v42, v32, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[18:19], v[16:17], v[64:79]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v140, v1
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v131
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v35
+ ; GCN9_4-NEXT: v_pack_b32_f16 v25, v18, v1
+ ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v34
+ ; GCN9_4-NEXT: v_fma_f32 v34, s4, v4, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v38
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
+ ; GCN9_4-NEXT: v_fma_f32 v26, s4, v3, -v134
+ ; GCN9_4-NEXT: v_exp_f32_e32 v27, v1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[20:21], v[24:25], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[16:19], v57 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v36, v20
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v40
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v39
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v29, v4
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v26
+ ; GCN9_4-NEXT: v_fma_f32 v28, s4, v5, -v134
+ ; GCN9_4-NEXT: v_perm_b32 v5, v58, v44, s5
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v26, v36
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[24:25], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v62
+ ; GCN9_4-NEXT: v_exp_f32_e32 v38, v0
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v140
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v27
+ ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v0, v21, v20
+ ; GCN9_4-NEXT: v_perm_b32 v20, v43, v33, s8
+ ; GCN9_4-NEXT: v_perm_b32 v21, v59, v45, s8
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v25, v4
+ ; GCN9_4-NEXT: v_fma_f32 v24, s4, v7, -v134
+ ; GCN9_4-NEXT: v_perm_b32 v4, v42, v32, s5
+ ; GCN9_4-NEXT: v_perm_b32 v16, v43, v33, s5
+ ; GCN9_4-NEXT: v_perm_b32 v7, v58, v44, s8
+ ; GCN9_4-NEXT: v_perm_b32 v17, v59, v45, s5
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b64 v135, v[4:5]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v136, v[6:7]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v137, v[16:17]
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b64 v138, v[20:21]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v34
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_read_b128 v[4:7], v139
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_fma_f32 v33, s4, v10, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[22:23], v[0:1], v[96:111]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v32, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v28
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v28, v29
+ ; GCN9_4-NEXT: ds_read_b128 v[20:23], v139 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[0:1], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v31, v16
+ ; GCN9_4-NEXT: v_pack_b32_f16 v16, v26, v28
+ ; GCN9_4-NEXT: v_fma_f32 v30, s4, v9, -v134
+ ; GCN9_4-NEXT: v_fma_f32 v28, s4, v14, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v37
+ ; GCN9_4-NEXT: v_exp_f32_e32 v34, v2
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v38
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[0:1], v[112:127]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v25
+ ; GCN9_4-NEXT: v_fma_f32 v18, s4, v11, -v134
+ ; GCN9_4-NEXT: v_pack_b32_f16 v17, v3, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+ ; GCN9_4-NEXT: v_exp_f32_e32 v19, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v139 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[16:17], v[96:111]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
+ ; GCN9_4-NEXT: ds_read_b128 v[8:11], v139 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v24, v4
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v32
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v30
+ ; GCN9_4-NEXT: v_exp_f32_e32 v26, v4
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v24
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v31
+ ; GCN9_4-NEXT: v_fma_f32 v21, s4, v13, -v134
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[16:17], v[64:79]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v33
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v34
+ ; GCN9_4-NEXT: v_exp_f32_e32 v30, v0
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v19
+ ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v0, v5, v20
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v17, v4
+ ; GCN9_4-NEXT: v_fma_f32 v16, s4, v15, -v134
+ ; GCN9_4-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v12
+ ; GCN9_4-NEXT: v_exp_f32_e32 v20, v8
+ ; GCN9_4-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v21
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v26
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[0:1], v[96:111]
+ ; GCN9_4-NEXT: ds_read_b128 v[4:7], v57
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[12:15], v57 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
+ ; GCN9_4-NEXT: v_exp_f32_e32 v22, v8
+ ; GCN9_4-NEXT: v_pack_b32_f16 v8, v18, v21
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[0:1], v[64:79]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v30
+ ; GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+ ; GCN9_4-NEXT: v_exp_f32_e32 v23, v2
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v2, v22
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v23
+ ; GCN9_4-NEXT: v_pack_b32_f16 v9, v3, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v16
+ ; GCN9_4-NEXT: v_exp_f32_e32 v10, v0
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v3, v20
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v10
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[4:5], v[8:9], v[96:111]
+ ; GCN9_4-NEXT: v_pack_b32_f16 v5, v1, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, 0, v49
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v50, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v51, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v52, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v53, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v54, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v55, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v56, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v161, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v162, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v148, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v152, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v153, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v164, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v151, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v154, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v165, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v166, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v167, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v155, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v156, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v157, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v158, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v146, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v147, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v143, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v150, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v129, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v142, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v61, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v159, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v128, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v160, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v163, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v168, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v63, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v130, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v60, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v132, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v4, v3, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v41, v0
+ ; GCN9_4-NEXT: s_nop 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[6:7], v[4:5], v[96:111]
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v46, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1152
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v47, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v131, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v35, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v40, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v39, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v140, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v27, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v36, v6
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[8:9], v[64:79]
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v29, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v38, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v25, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v32, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v31, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v34, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v19, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v24, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v26, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v30, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v17, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v20, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v22, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v23, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v10, v0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[4:5], v[64:79]
+ ; GCN9_4-NEXT: ds_bpermute_b32 v1, v133, v0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[4:5], v[80:95]
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: v_add_f32_e32 v4, v0, v1
+ ; GCN9_4-NEXT: ds_bpermute_b32 v5, v133, v4
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v57 offset:1728
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr0
+ ; GCN9_4-NEXT: v_cndmask_b32_e64 v1, v5, v4, s[6:7]
+ ; GCN9_4-NEXT: v_fmac_f32_e32 v1, v0, v48
+ ; GCN9_4-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
!0 = !{i64 2862105}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0473e017f193cb..5497d859c97dc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN9_4 %s
--- |
define amdgpu_kernel void @smallInterleave() #0 { ret void }
@@ -488,6 +489,501 @@
; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16
; GCN-NEXT: s_endpgm
+ ;
+ ; GCN9_4-LABEL: smallInterleave:
+ ; GCN9_4: ; %bb.0:
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr2
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN9_4-NEXT: v_readfirstlane_b32 s20, v2
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr3
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr0_vgpr1
+ ; GCN9_4-NEXT: s_lshl_b32 s4, s20, 7
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr6
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr7
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr49
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr48
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr51
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr68
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr69
+ ; GCN9_4-NEXT: v_max_f32_e32 v79, v69, v69
+ ; GCN9_4-NEXT: v_lshl_add_u32 v2, s20, 4, v3
+ ; GCN9_4-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s0, v2, v[0:1]
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr5
+ ; GCN9_4-NEXT: v_add_lshl_u32 v50, v5, s4, 1
+ ; GCN9_4-NEXT: v_add_u32_e32 v6, v6, v49
+ ; GCN9_4-NEXT: v_add_u32_e32 v7, v7, v49
+ ; GCN9_4-NEXT: ; kill: killed $vgpr7
+ ; GCN9_4-NEXT: ; kill: killed $vgpr6
+ ; GCN9_4-NEXT: ; kill: killed $vgpr4
+ ; GCN9_4-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+ ; GCN9_4-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v50, v[0:3]
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v6, s[16:19], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], v7, s[16:19], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: ds_read_b128 v[36:39], v48
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ds_read_b128 v[44:47], v48 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr6
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr5
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr2
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr3
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr0_sgpr1
+ ; GCN9_4-NEXT: v_perm_b32 v80, v66, v64, s2
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
+ ; GCN9_4-NEXT: v_perm_b32 v64, v66, v64, s3
+ ; GCN9_4-NEXT: v_perm_b32 v66, v67, v65, s2
+ ; GCN9_4-NEXT: v_perm_b32 v65, v67, v65, s3
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[36:39], v51
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[44:47], v51 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b128 v50, v[32:35]
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_read_b128 v[32:35], v48
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr44
+ ; GCN9_4-NEXT: v_add_u32_e32 v86, v44, v49
+ ; GCN9_4-NEXT: ; kill: killed $vgpr86
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[40:43], v48 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+ ; GCN9_4-NEXT: ds_read_b128 v[32:35], v51
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+ ; GCN9_4-NEXT: ds_read_b128 v[40:43], v51 offset:512
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr32
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr33
+ ; GCN9_4-NEXT: v_add_u32_e32 v32, s20, v32
+ ; GCN9_4-NEXT: v_and_b32_e32 v32, 0x1fffffff, v32
+ ; GCN9_4-NEXT: v_mul_lo_u32 v32, v32, s5
+ ; GCN9_4-NEXT: v_add_lshl_u32 v81, v33, v32, 1
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b32 v81, v80
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr36
+ ; GCN9_4-NEXT: v_lshl_add_u32 v82, v36, 1, v81
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v82, v64
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr37
+ ; GCN9_4-NEXT: v_lshl_add_u32 v83, v37, 1, v82
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr40
+ ; GCN9_4-NEXT: v_lshl_add_u32 v84, v40, 1, v83
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr41
+ ; GCN9_4-NEXT: v_add_u32_e32 v85, v41, v49
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v83, v66
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v84, v65
+ ; GCN9_4-NEXT: ; kill: killed $vgpr85
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
+ ; GCN9_4-NEXT: s_nop 7
+ ; GCN9_4-NEXT: s_nop 7
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: v_mul_f32_e32 v34, s4, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v35, s4, v17
+ ; GCN9_4-NEXT: v_mul_f32_e32 v45, s4, v18
+ ; GCN9_4-NEXT: v_mul_f32_e32 v46, s4, v19
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, s6, v35
+ ; GCN9_4-NEXT: v_mul_f32_e32 v47, s4, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v48, s4, v21
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v45, v46
+ ; GCN9_4-NEXT: v_mul_f32_e32 v50, s4, v22
+ ; GCN9_4-NEXT: v_mul_f32_e32 v51, s4, v23
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v47, v48
+ ; GCN9_4-NEXT: v_mul_f32_e32 v52, s4, v24
+ ; GCN9_4-NEXT: v_mul_f32_e32 v53, s4, v25
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v50, v51
+ ; GCN9_4-NEXT: v_mul_f32_e32 v54, s4, v26
+ ; GCN9_4-NEXT: v_mul_f32_e32 v55, s4, v27
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v52, v53
+ ; GCN9_4-NEXT: v_mul_f32_e32 v56, s4, v28
+ ; GCN9_4-NEXT: v_mul_f32_e32 v57, s4, v29
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v54, v55
+ ; GCN9_4-NEXT: v_mul_f32_e32 v58, s4, v30
+ ; GCN9_4-NEXT: v_mul_f32_e32 v59, s4, v31
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v56, v57
+ ; GCN9_4-NEXT: v_mul_f32_e32 v35, s4, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v38, s4, v1
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v58, v59
+ ; GCN9_4-NEXT: v_mul_f32_e32 v39, s4, v2
+ ; GCN9_4-NEXT: v_mul_f32_e32 v42, s4, v3
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v35, v38
+ ; GCN9_4-NEXT: v_mul_f32_e32 v43, s4, v4
+ ; GCN9_4-NEXT: v_mul_f32_e32 v45, s4, v5
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v39, v42
+ ; GCN9_4-NEXT: v_mul_f32_e32 v46, s4, v6
+ ; GCN9_4-NEXT: v_mul_f32_e32 v47, s4, v7
+ ; GCN9_4-NEXT: v_max3_f32 v34, v34, v43, v45
+ ; GCN9_4-NEXT: v_mul_f32_e32 v70, s4, v8
+ ; GCN9_4-NEXT: v_mul_f32_e32 v71, s4, v9
+ ; GCN9_4-NEXT: v_max3_f32 v78, v34, v46, v47
+ ; GCN9_4-NEXT: v_mul_f32_e32 v72, s4, v10
+ ; GCN9_4-NEXT: v_mul_f32_e32 v73, s4, v11
+ ; GCN9_4-NEXT: v_max3_f32 v70, v78, v70, v71
+ ; GCN9_4-NEXT: v_mul_f32_e32 v74, s4, v12
+ ; GCN9_4-NEXT: v_mul_f32_e32 v75, s4, v13
+ ; GCN9_4-NEXT: v_max3_f32 v70, v70, v72, v73
+ ; GCN9_4-NEXT: v_mul_f32_e32 v76, s4, v14
+ ; GCN9_4-NEXT: v_mul_f32_e32 v77, s4, v15
+ ; GCN9_4-NEXT: v_max3_f32 v70, v70, v74, v75
+ ; GCN9_4-NEXT: v_max3_f32 v70, v70, v76, v77
+ ; GCN9_4-NEXT: ds_bpermute_b32 v71, v68, v70
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: v_max_f32_e32 v64, v71, v71
+ ; GCN9_4-NEXT: v_max_f32_e32 v70, v70, v64
+ ; GCN9_4-NEXT: ds_bpermute_b32 v71, v68, v70
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[64:65], v85, s[16:19], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: buffer_load_dwordx2 v[66:67], v86, s[16:19], 0 offen sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+ ; GCN9_4-NEXT: v_max_f32_e32 v70, v70, v70
+ ; GCN9_4-NEXT: v_max_f32_e32 v72, v79, v70
+ ; GCN9_4-NEXT: v_fma_f32 v16, s4, v16, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v18, s4, v18, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v19, s4, v19, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+ ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+ ; GCN9_4-NEXT: v_fma_f32 v17, s4, v17, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v20, s4, v20, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v21, s4, v21, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v22, s4, v22, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v23, s4, v23, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+ ; GCN9_4-NEXT: v_exp_f32_e32 v73, v16
+ ; GCN9_4-NEXT: v_exp_f32_e32 v74, v18
+ ; GCN9_4-NEXT: v_exp_f32_e32 v75, v19
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+ ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+ ; GCN9_4-NEXT: v_exp_f32_e32 v76, v20
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v16, v73
+ ; GCN9_4-NEXT: v_fma_f32 v18, s4, v24, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v77, v21
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v74
+ ; GCN9_4-NEXT: v_fma_f32 v20, s4, v25, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+ ; GCN9_4-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+ ; GCN9_4-NEXT: v_exp_f32_e32 v78, v22
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v75
+ ; GCN9_4-NEXT: v_fma_f32 v26, s4, v26, -v72
+ ; GCN9_4-NEXT: v_sub_f32_e32 v24, v69, v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v69, v23
+ ; GCN9_4-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+ ; GCN9_4-NEXT: v_fma_f32 v27, s4, v27, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v85, v23
+ ; GCN9_4-NEXT: v_exp_f32_e32 v17, v17
+ ; GCN9_4-NEXT: v_pack_b32_f16 v71, v21, v22
+ ; GCN9_4-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr79
+ ; GCN9_4-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v19, v17
+ ; GCN9_4-NEXT: v_fma_f32 v28, s4, v28, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v29, s4, v29, -v72
+ ; GCN9_4-NEXT: v_pack_b32_f16 v70, v16, v19
+ ; GCN9_4-NEXT: ds_read_b128 v[18:21], v79
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v16, v24
+ ; GCN9_4-NEXT: s_nop 0
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+ ; GCN9_4-NEXT: v_add_f32_e32 v18, 0, v73
+ ; GCN9_4-NEXT: v_fma_f32 v31, s4, v31, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v73, v22
+ ; GCN9_4-NEXT: ds_read_b128 v[22:25], v79 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v19, v76
+ ; GCN9_4-NEXT: v_fma_f32 v0, s4, v0, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v17, v18
+ ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v80, v77
+ ; GCN9_4-NEXT: v_fma_f32 v23, s4, v30, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v26, v18
+ ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v27
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v78
+ ; GCN9_4-NEXT: v_fma_f32 v1, s4, v1, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v27, v18
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v74, v17
+ ; GCN9_4-NEXT: v_fma_f32 v4, s4, v4, -v72
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v30, v69
+ ; GCN9_4-NEXT: v_pack_b32_f16 v18, v19, v80
+ ; GCN9_4-NEXT: v_fma_f32 v5, s4, v5, -v72
+ ; GCN9_4-NEXT: ; implicit-def: $vgpr70
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4
+ ; GCN9_4-NEXT: v_pack_b32_f16 v19, v22, v30
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v28
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v75, v17
+ ; GCN9_4-NEXT: v_fma_f32 v6, s4, v6, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v28, v20
+ ; GCN9_4-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v29
+ ; GCN9_4-NEXT: v_fma_f32 v7, s4, v7, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v30, v20
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v23
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v76, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v22, v73
+ ; GCN9_4-NEXT: v_fma_f32 v24, s4, v2, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v25, v18
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v29, v85
+ ; GCN9_4-NEXT: ds_read_b128 v[18:21], v70
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_pack_b32_f16 v22, v22, v29
+ ; GCN9_4-NEXT: v_fma_f32 v29, s4, v3, -v72
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v77, v17
+ ; GCN9_4-NEXT: v_fma_f32 v10, s4, v10, -v72
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v71, v30
+ ; GCN9_4-NEXT: v_add_f32_e32 v17, v78, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v23, v26
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v2, v27
+ ; GCN9_4-NEXT: v_pack_b32_f16 v23, v23, v2
+ ; GCN9_4-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v31
+ ; GCN9_4-NEXT: v_exp_f32_e32 v31, v2
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[22:23], v[48:63]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v18, v28
+ ; GCN9_4-NEXT: v_exp_f32_e32 v19, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1
+ ; GCN9_4-NEXT: v_exp_f32_e32 v74, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v70 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[22:23], v[32:47]
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v25
+ ; GCN9_4-NEXT: v_fma_f32 v24, s4, v9, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v22, v0
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v31
+ ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v0, v18, v71
+ ; GCN9_4-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v29
+ ; GCN9_4-NEXT: s_nop 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[0:1], v[48:63]
+ ; GCN9_4-NEXT: v_fma_f32 v21, s4, v8, -v72
+ ; GCN9_4-NEXT: v_exp_f32_e32 v18, v18
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v20, v19
+ ; GCN9_4-NEXT: v_exp_f32_e32 v23, v4
+ ; GCN9_4-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v5
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v74
+ ; GCN9_4-NEXT: v_exp_f32_e32 v29, v4
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47]
+ ; GCN9_4-NEXT: v_perm_b32 v0, v66, v64, s2
+ ; GCN9_4-NEXT: v_perm_b32 v1, v66, v64, s3
+ ; GCN9_4-NEXT: v_perm_b32 v2, v67, v65, s2
+ ; GCN9_4-NEXT: v_perm_b32 v3, v67, v65, s3
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: ds_write_b32 v81, v0
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v82, v1
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v83, v2
+ ; GCN9_4-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_write_b32 v84, v3
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v6
+ ; GCN9_4-NEXT: ;;#ASMSTART
+ ; GCN9_4-NEXT: s_waitcnt vmcnt(8)
+ ; GCN9_4-NEXT: ;;#ASMEND
+ ; GCN9_4-NEXT: v_add_f32_e32 v4, v69, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v6, v22
+ ; GCN9_4-NEXT: v_exp_f32_e32 v17, v0
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v79
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v8, v18
+ ; GCN9_4-NEXT: ; implicit-def: $sgpr2
+ ; GCN9_4-NEXT: v_pack_b32_f16 v9, v6, v8
+ ; GCN9_4-NEXT: v_pack_b32_f16 v8, v20, v5
+ ; GCN9_4-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v7
+ ; GCN9_4-NEXT: s_nop 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v73, v4
+ ; GCN9_4-NEXT: v_exp_f32_e32 v20, v5
+ ; GCN9_4-NEXT: ds_read_b128 v[4:7], v79 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v21
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+ ; GCN9_4-NEXT: v_add_f32_e32 v4, v85, v0
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10
+ ; GCN9_4-NEXT: v_exp_f32_e32 v8, v0
+ ; GCN9_4-NEXT: v_exp_f32_e32 v64, v1
+ ; GCN9_4-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v24
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v24, v29
+ ; GCN9_4-NEXT: v_exp_f32_e32 v65, v1
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v21, v23
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v17
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v20
+ ; GCN9_4-NEXT: v_fma_f32 v9, s4, v15, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9
+ ; GCN9_4-NEXT: v_exp_f32_e32 v9, v9
+ ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v0, v21, v24
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v26, v4
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v4, v64
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v27, v2
+ ; GCN9_4-NEXT: v_fma_f32 v11, s4, v11, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v12, s4, v12, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v5, s4, v14, -v72
+ ; GCN9_4-NEXT: v_fma_f32 v13, s4, v13, -v72
+ ; GCN9_4-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12
+ ; GCN9_4-NEXT: v_mul_f32_e32 v10, 0x3fb8aa3b, v11
+ ; GCN9_4-NEXT: v_exp_f32_e32 v11, v3
+ ; GCN9_4-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+ ; GCN9_4-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v5
+ ; GCN9_4-NEXT: v_exp_f32_e32 v10, v10
+ ; GCN9_4-NEXT: v_exp_f32_e32 v13, v3
+ ; GCN9_4-NEXT: v_exp_f32_e32 v7, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v70
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v12, v65
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v5, v8
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v14, v10
+ ; GCN9_4-NEXT: v_pack_b32_f16 v4, v4, v12
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v12, v13
+ ; GCN9_4-NEXT: v_pack_b32_f16 v5, v5, v14
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v14, v11
+ ; GCN9_4-NEXT: s_nop 0
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v0, v9
+ ; GCN9_4-NEXT: v_cvt_f16_f32_e32 v1, v7
+ ; GCN9_4-NEXT: v_pack_b32_f16 v1, v1, v0
+ ; GCN9_4-NEXT: v_pack_b32_f16 v0, v14, v12
+ ; GCN9_4-NEXT: s_nop 1
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v28, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v30, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v25, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v31, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v19, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v74, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v22, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v18, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v23, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v29, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v17, v0
+ ; GCN9_4-NEXT: v_add_f32_e32 v6, v20, v0
+ ; GCN9_4-NEXT: ds_read_b128 v[0:3], v70 offset:576
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: buffer_inv sc0 sc1
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v64, v6
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v65, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v8, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v10, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v11, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v13, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v7, v2
+ ; GCN9_4-NEXT: v_add_f32_e32 v2, v9, v2
+ ; GCN9_4-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[4:5], v[32:47]
+ ; GCN9_4-NEXT: ds_bpermute_b32 v0, v68, v2
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: v_add_f32_e32 v0, v2, v0
+ ; GCN9_4-NEXT: ds_bpermute_b32 v1, v68, v0
+ ; GCN9_4-NEXT: v_mov_b32_e32 v2, 0
+ ; GCN9_4-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN9_4-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1]
+ ; GCN9_4-NEXT: v_fmac_f32_e32 v0, v2, v16
+ ; GCN9_4-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
!0 = !{i64 2862105}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
index a3d789c1ccc36f..daec7e9b91e71e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
@@ -2,6 +2,7 @@
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
; DPP control value 337 is valid for 64-bit DPP on gfx942
diff --git a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
index 5201f188afd5f8..df717d0ae497d3 100644
--- a/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-corresponding-integer-type.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 %s -o - | FileCheck %s
define void @no_corresponding_integer_type(i8 %arg, ptr addrspace(1) %ptr) {
; CHECK-LABEL: no_corresponding_integer_type:
diff --git a/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s
new file mode 100644
index 00000000000000..6ade556f21a1d1
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_4_generic_unsupported.s
@@ -0,0 +1,104 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx9-4-generic -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --implicit-check-not=error: %s
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8 v1, 3
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_e64 v5, v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_bf8_sdwa v5, v1 src0_sel:BYTE_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8 v1, 3
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_e64 v5, v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_f32_fp8_sdwa v5, v1 src0_sel:BYTE_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_fp8_sdwa v[10:11], v1 src0_sel:WORD_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8_dpp v[10:11], v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_cvt_pk_f32_bf8_sdwa v[10:11], v1 src0_sel:WORD_0
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[2:3], v[4:5], a[0:15]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[2:3], v[4:5], a[0:15]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[2:3], v[4:5], a[0:15]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[2:3], v[4:5], a[0:15]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[2:3], a[4:7], v1
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
index 37234dba7d9b4c..9c79ea588f6247 100644
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -250,6 +250,10 @@
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_GENERIC %s
# RUN: obj2yaml %t.o.AMDGCN_GFX9_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_GENERIC %s
+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX9_4_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX9_4_GENERIC
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX9_4_GENERIC %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX9_4_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX9_4_GENERIC %s
+
# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX10_1_GENERIC/' %s | yaml2obj -o %t.o.AMDGCN_GFX10_1_GENERIC
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX10_1_GENERIC %s
# RUN: obj2yaml %t.o.AMDGCN_GFX10_1_GENERIC | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX10_1_GENERIC %s
@@ -473,6 +477,9 @@
# ELF-AMDGCN-GFX9_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC (0x51)
# YAML-AMDGCN-GFX9_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC ]
+# ELF-AMDGCN-GFX9_4_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC (0x5F)
+# YAML-AMDGCN-GFX9_4_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC ]
+
# ELF-AMDGCN-GFX10_1_GENERIC: EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC (0x52)
# YAML-AMDGCN-GFX10_1_GENERIC: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC ]
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
index c38f6b4e7833cd..45071ecb751321 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -137,6 +137,12 @@ define amdgpu_kernel void @test_kernel() {
; ----------------------------------GFX9---------------------------------------
;
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt
+; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-generic -filetype=obj -O0 -o %t.o %s
; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-generic %t.o > %t-specify.txt
; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
index 78acbd657b7635..34c22dca3aa183 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -364,6 +364,9 @@
# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC -DFLAG_VALUE=0x51
+# RUN: yaml2obj %s -o %t -DABI_VERSION=4 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=4 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC -DFLAG_VALUE=0x5F
+
# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100
# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1100 -DFLAG_VALUE=0x41
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 7d92a492d8b181..1012cd020d525e 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1641,6 +1641,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"), \
+ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_4_GENERIC, "gfx9-4-generic"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC, "gfx10-1-generic"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC, "gfx10-3-generic"), \
ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, "gfx11-generic"), \
More information about the llvm-branch-commits
mailing list