[llvm] [AMDGPU] Serialize disjoint MFMA chains to hide DS_READ latency (PR #170242)
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 11:51:50 PST 2025
https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/170242
>From f5d4235cf2e2f0e74a3d08935f249670892d0c1c Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Mon, 17 Nov 2025 21:27:54 -0600
Subject: [PATCH] [AMDGPU] Serialize disjoint MFMA chains to hide DS_READ
latency
This patch identifies disjoint chains of dependent
MFMA instructions (with length >= 2) and stitches
them together into a single execution sequence by
adding artificial dependencies from the tail of
one chain to the head of the next.
Currently, the scheduler may schedule disjoint
MFMA chains too early or interleave them, which
can expose high latencies from their associated
DS_READ operands. By strictly serializing these
MFMA chains, we force subsequent chains to execute
later. This artificial delay increases the distance
between the DS_READ issuance and the consuming MFMA
instruction, effectively hiding the load latency.
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 87 +
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 1879 +++++++++--------
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 756 +++----
.../misched-ds-mfma-order-false-deps.mir | 118 ++
.../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 80 +-
5 files changed, 1573 insertions(+), 1347 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 85addb13aef8d..1107bc7bb5f42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -21,6 +21,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur(
"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));
+static cl::opt<bool> DisableMfmaChainOrderingDeps(
+ "amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden,
+ cl::desc("Disable artificial false dependencies between MFMA chains"));
+
// Components of the mask that determines which instruction types may be may be
// classified into a SchedGroup.
enum class SchedGroupMask {
@@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
// Add DAG edges that enforce SCHED_BARRIER ordering.
void addSchedBarrierEdges(SUnit &SU);
+ // Add artificial false-dependencies from the tail of each MFMA chain to the
+ // leader of the next chain, serializing disjoint chains of length >= 2.
+ void addMfmaFalseDeps();
+
// Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
// not be reordered accross the SCHED_BARRIER. This is used for the base
// SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
@@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
}
}
+ if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts())
+ addMfmaFalseDeps();
+
if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it
@@ -2681,6 +2693,81 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
} // namespace
+// Group the MFMA/WMMA instructions in the DAG into chains linked by data
+// dependencies and add an artificial edge from the tail of each chain to the
+// leader of the next one, so that neighbouring chains of length >= 2 are
+// scheduled one after the other rather than interleaved.
+void IGroupLPDAGMutation::addMfmaFalseDeps() {
+  // Chain discovery: a chain starts at the first MFMA/WMMA (in SUnit order)
+  // not already visited and follows data edges for as long as the current
+  // node has exactly one MFMA/WMMA data successor edge.
+  SmallVector<SUnit *, 10> MFMAChainLeaders;
+  DenseMap<SUnit *, SUnit *> MFMAChainNext;
+  for (auto &SU : DAG->SUnits) {
+    if (!TII->isMFMAorWMMA(*SU.getInstr()) || MFMAChainNext.count(&SU))
+      continue;
+
+    SUnit *CurrMFMA = &SU;
+    MFMAChainLeaders.push_back(&SU);
+    while (true) {
+      // Find the unique MFMA/WMMA data successor of CurrMFMA, if there is one.
+      SUnit *Next = nullptr;
+      unsigned MFMADataDepSuccCount = 0;
+      for (const auto &Succ : CurrMFMA->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr()))
+          continue;
+
+        // Only data edges extend a chain; a second one ends it at CurrMFMA.
+        if (Succ.getKind() == SDep::Data) {
+          Next = MFMADataDepSuccCount == 0 ? SuccSU : nullptr;
+          ++MFMADataDepSuccCount;
+        }
+      }
+
+      // Record every visited node, including a tail without successors, so the
+      // outer loop never restarts a chain from it. This must not be left to an
+      // assert expression, which is not evaluated in NDEBUG builds.
+      MFMAChainNext[CurrMFMA] = Next;
+      if (!Next)
+        break;
+      CurrMFMA = Next;
+    }
+  }
+
+  // Walk the recorded links from Leader to the chain's tail, counting SUnits.
+  auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> {
+    unsigned Length = 1;
+    SUnit *Curr = Leader;
+    while (SUnit *Next = MFMAChainNext.lookup(Curr)) {
+      Curr = Next;
+      ++Length;
+    }
+    return {Curr, Length};
+  };
+
+  // Serialize neighbouring chains: for each pair of leaders that are adjacent
+  // in discovery order, make the later chain's leader depend on the earlier
+  // chain's tail. Chains shorter than two SUs are left unconstrained.
+  for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) {
+    SUnit *ChainLeaderA = MFMAChainLeaders[I];
+    SUnit *ChainLeaderB = MFMAChainLeaders[I + 1];
+
+    auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA);
+    auto [TailB, LengthB] = GetTailAndLength(ChainLeaderB);
+    if (LengthA < 2 || LengthB < 2)
+      continue;
+
+    // addEdge() returns false when it rejects an edge (e.g. one that would
+    // create a cycle), so only report edges that were actually added.
+    if (DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial))) {
+      LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from "
+                        << TailA->NodeNum << " to " << ChainLeaderB->NodeNum
+                        << "\n");
+    }
+  }
+}
+
/// \p Phase specifes whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 689d1472d6010..3258c19097721 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,1147 +6,1172 @@
define amdgpu_kernel void @largeInterleave() #0 { ret void }
; GCN-LABEL: largeInterleave:
; GCN: ; %bb.0:
- ; GCN-NEXT: ; implicit-def: $vgpr16
- ; GCN-NEXT: ; implicit-def: $vgpr25
- ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s17, v16
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr17
- ; GCN-NEXT: ; implicit-def: $sgpr15
- ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
- ; GCN-NEXT: s_lshl_b32 s18, s17, 7
- ; GCN-NEXT: ; implicit-def: $vgpr18
- ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1
- ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25
- ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6
- ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1
- ; GCN-NEXT: v_add_u32_e32 v17, s15, v226
- ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr2
+ ; GCN-NEXT: ; implicit-def: $vgpr1
+ ; GCN-NEXT: ; implicit-def: $vgpr8
+ ; GCN-NEXT: ; implicit-def: $vgpr45
+ ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
+ ; GCN-NEXT: ; implicit-def: $vgpr164
+ ; GCN-NEXT: ; implicit-def: $vgpr124_vgpr125_vgpr126_vgpr127
+ ; GCN-NEXT: ; implicit-def: $vgpr116_vgpr117_vgpr118_vgpr119
+ ; GCN-NEXT: ; implicit-def: $vgpr100_vgpr101_vgpr102_vgpr103
+ ; GCN-NEXT: ; implicit-def: $vgpr165
+ ; GCN-NEXT: ; implicit-def: $vgpr104_vgpr105_vgpr106_vgpr107
+ ; GCN-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95
+ ; GCN-NEXT: ; implicit-def: $vgpr84_vgpr85_vgpr86_vgpr87
+ ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83
+ ; GCN-NEXT: ; implicit-def: $vgpr166
+ ; GCN-NEXT: ; implicit-def: $vgpr167
+ ; GCN-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN-NEXT: ; implicit-def: $sgpr5
+ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN-NEXT: v_readfirstlane_b32 s9, v0
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: ; implicit-def: $sgpr11
+ ; GCN-NEXT: ; implicit-def: $sgpr8
+ ; GCN-NEXT: ; implicit-def: $sgpr10
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_lshl_add_u32 v0, s9, 4, v2
+ ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
+ ; GCN-NEXT: v_add_lshl_u32 v36, v0, v1, 1
+ ; GCN-NEXT: v_add_u32_e32 v32, s5, v36
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v36, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v32, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_add_u32_e32 v72, 64, v17
- ; GCN-NEXT: ; implicit-def: $vgpr213
- ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155
- ; GCN-NEXT: ; implicit-def: $vgpr246
- ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17
- ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
- ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
- ; GCN-NEXT: ; implicit-def: $vgpr19
- ; GCN-NEXT: ; implicit-def: $vgpr26
- ; GCN-NEXT: ; implicit-def: $vgpr27
- ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17
- ; GCN-NEXT: v_add_u32_e32 v231, v19, v26
- ; GCN-NEXT: v_add_u32_e32 v232, v19, v27
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; implicit-def: $vgpr28
- ; GCN-NEXT: ; implicit-def: $vgpr29
- ; GCN-NEXT: v_add_u32_e32 v233, v19, v28
- ; GCN-NEXT: v_add_u32_e32 v234, v19, v29
- ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
+ ; GCN-NEXT: s_lshl_b32 s5, s9, 7
+ ; GCN-NEXT: v_add_lshl_u32 v44, v8, s5, 1
+ ; GCN-NEXT: v_add_u32_e32 v8, 64, v32
+ ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: v_add_u32_e32 v28, 0x80, v32
+ ; GCN-NEXT: ; kill: killed $vgpr28
+ ; GCN-NEXT: v_add_u32_e32 v37, 0xc0, v32
+ ; GCN-NEXT: ; kill: killed $vgpr37
+ ; GCN-NEXT: ; kill: killed $vgpr36
; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: ; implicit-def: $sgpr7
- ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
- ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
- ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: v_add_u32_e32 v18, s17, v20
- ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18
- ; GCN-NEXT: ; implicit-def: $sgpr16
- ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1
- ; GCN-NEXT: ; implicit-def: $vgpr22
- ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199
- ; GCN-NEXT: ; implicit-def: $vgpr23
- ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200
- ; GCN-NEXT: ; implicit-def: $vgpr24
- ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201
- ; GCN-NEXT: ; implicit-def: $vgpr16
- ; GCN-NEXT: ; implicit-def: $vgpr18
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr24
- ; GCN-NEXT: v_add_u32_e32 v247, v19, v24
- ; GCN-NEXT: v_add_u32_e32 v248, v19, v16
- ; GCN-NEXT: v_add_u32_e32 v249, v19, v18
- ; GCN-NEXT: v_add_u32_e32 v250, v19, v20
- ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
- ; GCN-NEXT: ; implicit-def: $sgpr14
- ; GCN-NEXT: ; implicit-def: $vgpr196
- ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13
- ; GCN-NEXT: ; implicit-def: $vgpr211
- ; GCN-NEXT: v_max_f32_e32 v212, v211, v211
- ; GCN-NEXT: ; implicit-def: $vgpr198
- ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN-NEXT: ; implicit-def: $vgpr32
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: ; implicit-def: $vgpr34
- ; GCN-NEXT: v_add_u32_e32 v210, v19, v34
- ; GCN-NEXT: v_add_u32_e32 v206, v19, v33
- ; GCN-NEXT: v_add_u32_e32 v205, v19, v32
- ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- ; GCN-NEXT: ; implicit-def: $vgpr21
- ; GCN-NEXT: ; implicit-def: $vgpr22
- ; GCN-NEXT: ; implicit-def: $vgpr23
- ; GCN-NEXT: ; implicit-def: $vgpr30
- ; GCN-NEXT: ; implicit-def: $vgpr31
- ; GCN-NEXT: v_add_u32_e32 v207, v19, v21
- ; GCN-NEXT: v_add_u32_e32 v208, v19, v22
- ; GCN-NEXT: v_add_u32_e32 v209, v19, v23
- ; GCN-NEXT: v_add_u32_e32 v203, v19, v30
- ; GCN-NEXT: v_add_u32_e32 v204, v19, v31
- ; GCN-NEXT: ; kill: killed $vgpr17
- ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; GCN-NEXT: ; implicit-def: $vgpr197
- ; GCN-NEXT: ; iglp_opt mask(0x00000002)
+ ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v230, v[64:67]
+ ; GCN-NEXT: ds_write_b128 v44, v[0:3]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024
+ ; GCN-NEXT: ds_write_b128 v44, v[4:7] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v36, s[0:3], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[64:67], v213
+ ; GCN-NEXT: ds_read_b128 v[8:11], v45
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512
+ ; GCN-NEXT: ds_read_b128 v[12:15], v45 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111]
- ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[16:19], v45 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[128:129], 0
+ ; GCN-NEXT: ds_read_b128 v[136:139], v45 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[172:175], v246
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[130:131], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[8:11], v164
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512
+ ; GCN-NEXT: ds_read_b128 v[20:23], v164 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[24:27], v164 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[132:135], v164 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v230, v[160:163]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95]
+ ; GCN-NEXT: ds_write_b128 v44, v[0:3]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79]
+ ; GCN-NEXT: ds_write_b128 v44, v[4:7] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v36, s[0:3], 0 offen offset:128 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v28, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[124:125], v[48:63]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[188:191], v213
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[126:127], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[8:11], v45
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512
+ ; GCN-NEXT: ds_read_b128 v[28:31], v45 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[148:151], v45 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[120:123], v45 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[218:221], v246
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[116:117], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[118:119], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[8:11], v164
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512
+ ; GCN-NEXT: ds_read_b128 v[32:35], v164 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[152:155], v164 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[108:111], v164 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v230, v[152:155]
+ ; GCN-NEXT: ds_write_b128 v44, v[0:3]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024
+ ; GCN-NEXT: ds_write_b128 v44, v[4:7] offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v36, s[0:3], 0 offen offset:192 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
- ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v37, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[100:101], v[48:63]
+ ; GCN-NEXT: ; implicit-def: $vgpr8
+ ; GCN-NEXT: ; implicit-def: $vgpr9
+ ; GCN-NEXT: ; implicit-def: $vgpr36
+ ; GCN-NEXT: ; implicit-def: $vgpr37
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: v_add_u32_e32 v8, v165, v8
+ ; GCN-NEXT: v_add_u32_e32 v9, v165, v9
+ ; GCN-NEXT: v_add_u32_e32 v36, v165, v36
+ ; GCN-NEXT: v_add_u32_e32 v37, v165, v37
+ ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v8, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v9, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v36, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v37, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
- ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
- ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7
- ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5
- ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7
- ; GCN-NEXT: ds_read_b128 v[160:163], v213
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[102:103], v[48:63]
+ ; GCN-NEXT: ; kill: killed $vgpr8
+ ; GCN-NEXT: ; kill: killed $vgpr9
+ ; GCN-NEXT: ds_read_b128 v[8:11], v45
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5
- ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7
- ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79]
- ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512
+ ; GCN-NEXT: ; kill: killed $vgpr37
+ ; GCN-NEXT: ; kill: killed $vgpr36
+ ; GCN-NEXT: ds_read_b128 v[36:39], v45 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[156:159], v45 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[112:115], v45 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[184:187], v246
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[104:105], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[106:107], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[8:11], v164
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512
+ ; GCN-NEXT: ds_read_b128 v[40:43], v164 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024
+ ; GCN-NEXT: ds_read_b128 v[160:163], v164 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79]
- ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[96:99], v164 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v230, v[152:155]
+ ; GCN-NEXT: ds_write_b128 v44, v[0:3]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[8:9], v[92:93], v[48:63]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024
+ ; GCN-NEXT: ds_write_b128 v44, v[4:7] offset:1024
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[156:159], v213
+ ; GCN-NEXT: ds_read_b128 v[0:3], v45
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512
+ ; GCN-NEXT: ds_read_b128 v[4:7], v45 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[10:11], v[94:95], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[8:11], v45 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536
+ ; GCN-NEXT: ds_read_b128 v[88:91], v45 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[230:233], v246
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[84:85], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[86:87], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[0:3], v164
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[80:81], v[48:63]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[82:83], v[48:63]
+ ; GCN-NEXT: ds_read_b128 v[0:3], v164 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[12:13], v[128:129], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[14:15], v[130:131], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[124:125], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[126:127], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[28:29], v[116:117], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[30:31], v[118:119], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[32:33], v[100:101], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[34:35], v[102:103], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[104:105], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[106:107], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[40:41], v[92:93], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[42:43], v[94:95], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[84:85], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[86:87], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[0:1], v[80:81], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[2:3], v[82:83], v[64:79]
+ ; GCN-NEXT: ds_read_b128 v[0:3], v164 offset:1024
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127]
- ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[16:17], v[128:129], 0
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: v_add_u32_e32 v169, v165, v16
+ ; GCN-NEXT: v_add_u32_e32 v170, v165, v17
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[130:131], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr18
+ ; GCN-NEXT: v_add_u32_e32 v171, v165, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[124:125], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[26:27], v[126:127], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[116:117], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr149
+ ; GCN-NEXT: ; implicit-def: $vgpr148
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[150:151], v[118:119], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr150
+ ; GCN-NEXT: ; implicit-def: $vgpr151
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[152:153], v[100:101], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr152
+ ; GCN-NEXT: ; implicit-def: $vgpr153
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[154:155], v[102:103], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr154
+ ; GCN-NEXT: ; implicit-def: $vgpr155
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[156:157], v[104:105], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr156
+ ; GCN-NEXT: ; implicit-def: $vgpr157
+ ; GCN-NEXT: v_add_u32_e32 v182, v165, v157
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[158:159], v[106:107], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr158
+ ; GCN-NEXT: ; implicit-def: $vgpr159
+ ; GCN-NEXT: v_add_u32_e32 v200, v165, v158
+ ; GCN-NEXT: v_add_u32_e32 v201, v165, v159
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[160:161], v[92:93], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr160
+ ; GCN-NEXT: ; implicit-def: $vgpr161
+ ; GCN-NEXT: v_add_u32_e32 v199, v165, v161
+ ; GCN-NEXT: v_add_u32_e32 v202, v165, v160
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[162:163], v[94:95], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr162
+ ; GCN-NEXT: v_add_u32_e32 v168, v165, v162
+ ; GCN-NEXT: ; implicit-def: $vgpr163
+ ; GCN-NEXT: v_max_f32_e32 v172, v163, v163
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[8:9], v[84:85], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[10:11], v[86:87], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[80:81], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[82:83], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[128:129], 0
+ ; GCN-NEXT: v_mul_f32_e32 v137, s4, v54
+ ; GCN-NEXT: v_add_u32_e32 v128, v165, v153
+ ; GCN-NEXT: v_add_u32_e32 v129, v165, v154
+ ; GCN-NEXT: v_add_u32_e32 v136, v165, v156
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[130:131], v[16:31]
+ ; GCN-NEXT: v_add_u32_e32 v130, s9, v149
+ ; GCN-NEXT: v_and_b32_e32 v130, 0x1fffffff, v130
+ ; GCN-NEXT: v_mul_lo_u32 v130, v130, s11
+ ; GCN-NEXT: v_add_lshl_u32 v149, v150, v130, 1
+ ; GCN-NEXT: v_lshl_add_u32 v150, v151, 1, v149
+ ; GCN-NEXT: v_lshl_add_u32 v151, v166, 1, v150
+ ; GCN-NEXT: v_lshl_add_u32 v186, v167, 1, v151
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[132:133], v[124:125], v[16:31]
+ ; GCN-NEXT: v_perm_b32 v124, v146, v142, s5
+ ; GCN-NEXT: v_perm_b32 v125, v144, v140, s5
+ ; GCN-NEXT: v_perm_b32 v138, v146, v142, s8
+ ; GCN-NEXT: v_perm_b32 v139, v144, v140, s8
+ ; GCN-NEXT: v_perm_b32 v142, v147, v143, s5
+ ; GCN-NEXT: v_perm_b32 v140, v147, v143, s8
+ ; GCN-NEXT: v_perm_b32 v143, v145, v141, s5
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
+ ; GCN-NEXT: v_perm_b32 v141, v145, v141, s8
+ ; GCN-NEXT: ds_read_b128 v[130:133], v164 offset:1536
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v199, v[238:239]
+ ; GCN-NEXT: ds_write_b64 v149, v[124:125]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v200, v[240:241]
+ ; GCN-NEXT: ds_write_b64 v150, v[138:139]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v201, v[242:243]
+ ; GCN-NEXT: ds_write_b64 v151, v[142:143]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v202, v[244:245]
+ ; GCN-NEXT: ds_write_b64 v186, v[140:141]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[120:121], v[116:117], v[16:31]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[116:117], v168, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111]
- ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[124:125], v169, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[120:121], v170, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[126:127], v171, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v134, s4, v48
+ ; GCN-NEXT: v_mul_f32_e32 v135, s4, v49
+ ; GCN-NEXT: v_mul_f32_e32 v138, s4, v55
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[122:123], v[118:119], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v118, s4, v50
+ ; GCN-NEXT: v_mul_f32_e32 v119, s4, v51
+ ; GCN-NEXT: v_mul_f32_e32 v122, s4, v52
+ ; GCN-NEXT: v_mul_f32_e32 v123, s4, v53
+ ; GCN-NEXT: v_mul_f32_e32 v139, s4, v56
+ ; GCN-NEXT: v_mul_f32_e32 v140, s4, v61
+ ; GCN-NEXT: v_mul_f32_e32 v141, s4, v62
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[108:109], v[100:101], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v100, s4, v57
+ ; GCN-NEXT: v_mul_f32_e32 v101, s4, v58
+ ; GCN-NEXT: v_mul_f32_e32 v108, s4, v59
+ ; GCN-NEXT: v_mul_f32_e32 v109, s4, v60
+ ; GCN-NEXT: v_mul_f32_e32 v142, s4, v63
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95]
- ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5
- ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7
- ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7
- ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5
- ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7
- ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5
- ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[110:111], v[102:103], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v102, v134, s10, v135
+ ; GCN-NEXT: v_max3_f32 v102, v102, v118, v119
+ ; GCN-NEXT: v_max3_f32 v102, v102, v122, v123
+ ; GCN-NEXT: v_max3_f32 v102, v102, v137, v138
+ ; GCN-NEXT: v_max3_f32 v100, v102, v139, v100
+ ; GCN-NEXT: v_max3_f32 v100, v100, v101, v108
+ ; GCN-NEXT: v_max3_f32 v100, v100, v109, v140
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[112:113], v[104:105], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v100, v100, v141, v142
+ ; GCN-NEXT: v_mul_f32_e32 v101, s4, v64
+ ; GCN-NEXT: v_mul_f32_e32 v102, s4, v65
+ ; GCN-NEXT: v_mul_f32_e32 v103, s4, v66
+ ; GCN-NEXT: v_mul_f32_e32 v104, s4, v67
+ ; GCN-NEXT: v_mul_f32_e32 v105, s4, v68
+ ; GCN-NEXT: v_mul_f32_e32 v108, s4, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[114:115], v[106:107], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v106, s4, v70
+ ; GCN-NEXT: v_mul_f32_e32 v107, s4, v71
+ ; GCN-NEXT: v_mul_f32_e32 v109, s4, v72
+ ; GCN-NEXT: v_mul_f32_e32 v110, s4, v73
+ ; GCN-NEXT: v_mul_f32_e32 v111, s4, v74
+ ; GCN-NEXT: v_mul_f32_e32 v112, s4, v75
+ ; GCN-NEXT: v_mul_f32_e32 v113, s4, v76
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[96:97], v[92:93], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v97, v100, v101, v102
+ ; GCN-NEXT: v_max3_f32 v97, v97, v103, v104
+ ; GCN-NEXT: v_max3_f32 v97, v97, v105, v108
+ ; GCN-NEXT: v_max3_f32 v97, v97, v106, v107
+ ; GCN-NEXT: v_mul_f32_e32 v92, s4, v77
+ ; GCN-NEXT: v_mul_f32_e32 v93, s4, v78
+ ; GCN-NEXT: v_mul_f32_e32 v96, s4, v79
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[98:99], v[94:95], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v94, v97, v109, v110
+ ; GCN-NEXT: v_max3_f32 v94, v94, v111, v112
+ ; GCN-NEXT: v_max3_f32 v92, v94, v113, v92
+ ; GCN-NEXT: v_max3_f32 v92, v92, v93, v96
+ ; GCN-NEXT: v_mul_f32_e32 v93, s4, v32
+ ; GCN-NEXT: v_mul_f32_e32 v94, s4, v33
+ ; GCN-NEXT: v_mul_f32_e32 v95, s4, v34
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[88:89], v[84:85], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v35
+ ; GCN-NEXT: v_max3_f32 v92, v92, v93, v94
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v37
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v38
+ ; GCN-NEXT: v_mul_f32_e32 v96, s4, v39
+ ; GCN-NEXT: v_mul_f32_e32 v97, s4, v40
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[90:91], v[86:87], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v98, s4, v41
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v42
+ ; GCN-NEXT: v_mul_f32_e32 v87, s4, v43
+ ; GCN-NEXT: v_mul_f32_e32 v90, s4, v44
+ ; GCN-NEXT: v_mul_f32_e32 v91, s4, v45
+ ; GCN-NEXT: v_mul_f32_e32 v99, s4, v46
+ ; GCN-NEXT: v_mul_f32_e32 v100, s4, v47
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[80:81], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v80, v92, v95, v84
+ ; GCN-NEXT: v_max3_f32 v80, v80, v85, v88
+ ; GCN-NEXT: v_max3_f32 v80, v80, v89, v96
+ ; GCN-NEXT: v_max3_f32 v80, v80, v97, v98
+ ; GCN-NEXT: v_max3_f32 v80, v80, v86, v87
+ ; GCN-NEXT: v_max3_f32 v80, v80, v90, v91
+ ; GCN-NEXT: v_max3_f32 v80, v80, v99, v100
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[132:133], v[82:83], v[16:31]
+ ; GCN-NEXT: v_perm_b32 v97, v127, v121, s8
; GCN-NEXT: s_nop 9
- ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113
- ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95]
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79]
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111]
- ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126
- ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127
- ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95]
- ; GCN-NEXT: s_nop 6
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95]
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110
- ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111
- ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215
- ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81
- ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94
- ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95
- ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142
- ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65
- ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v68
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v69
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v70
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v71
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v72
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v73
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v74
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v75
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v76
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v77
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78
- ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79
- ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130
- ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[130:133], v198
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_max_f32_e32 v129, v129, v129
- ; GCN-NEXT: v_max_f32_e32 v128, v128, v129
- ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13]
- ; GCN-NEXT: v_max_f32_e32 v128, v128, v128
- ; GCN-NEXT: v_max_f32_e32 v128, v212, v128
- ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128
- ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128
- ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128
- ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128
- ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128
- ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128
- ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128
- ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128
- ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128
- ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128
- ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112
- ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113
- ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120
- ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128
- ; GCN-NEXT: v_exp_f32_e32 v114, v138
- ; GCN-NEXT: v_exp_f32_e32 v115, v139
- ; GCN-NEXT: v_exp_f32_e32 v116, v140
- ; GCN-NEXT: v_exp_f32_e32 v117, v141
- ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118
- ; GCN-NEXT: v_exp_f32_e32 v118, v142
- ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120
- ; GCN-NEXT: v_exp_f32_e32 v120, v144
- ; GCN-NEXT: v_exp_f32_e32 v113, v112
- ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114
- ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116
- ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129
- ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128
- ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119
- ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115
- ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122
- ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117
- ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128
- ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121
- ; GCN-NEXT: v_exp_f32_e32 v112, v129
- ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122
- ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128
- ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v119, v143
- ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
- ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112
- ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112
- ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112
- ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112
- ; GCN-NEXT: v_mul_f32_e64 v24, v24, v112
- ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112
- ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124
- ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119
- ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120
- ; GCN-NEXT: v_exp_f32_e32 v121, v148
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v122, v149
- ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126
- ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125
- ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128
- ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v123, v150
- ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
- ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128
- ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128
- ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128
- ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128
- ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v124, v151
- ; GCN-NEXT: ds_read_b128 v[130:133], v197
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122
- ; GCN-NEXT: v_exp_f32_e32 v96, v129
- ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
- ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136
- ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v97, v125
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137
- ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128
- ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124
- ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128
- ; GCN-NEXT: v_exp_f32_e32 v98, v138
- ; GCN-NEXT: v_exp_f32_e32 v99, v127
- ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134
- ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15]
- ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96
- ; GCN-NEXT: v_exp_f32_e32 v100, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97
+ ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16
+ ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17
+ ; GCN-NEXT: v_mul_f32_e32 v83, s4, v18
+ ; GCN-NEXT: v_mul_f32_e32 v84, s4, v19
+ ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82
+ ; GCN-NEXT: v_mul_f32_e32 v85, s4, v20
+ ; GCN-NEXT: v_mul_f32_e32 v86, s4, v21
+ ; GCN-NEXT: v_max3_f32 v80, v80, v83, v84
+ ; GCN-NEXT: v_mul_f32_e32 v87, s4, v22
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v23
+ ; GCN-NEXT: v_max3_f32 v80, v80, v85, v86
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v24
+ ; GCN-NEXT: v_mul_f32_e32 v90, s4, v25
+ ; GCN-NEXT: v_max3_f32 v80, v80, v87, v88
+ ; GCN-NEXT: v_mul_f32_e32 v91, s4, v26
+ ; GCN-NEXT: v_mul_f32_e32 v92, s4, v27
+ ; GCN-NEXT: v_max3_f32 v80, v80, v89, v90
+ ; GCN-NEXT: v_mul_f32_e32 v93, s4, v28
+ ; GCN-NEXT: v_mul_f32_e32 v94, s4, v29
+ ; GCN-NEXT: v_max3_f32 v80, v80, v91, v92
+ ; GCN-NEXT: v_mul_f32_e32 v95, s4, v30
+ ; GCN-NEXT: v_mul_f32_e32 v96, s4, v31
+ ; GCN-NEXT: v_max3_f32 v80, v80, v93, v94
+ ; GCN-NEXT: v_max3_f32 v80, v80, v95, v96
+ ; GCN-NEXT: ds_bpermute_b32 v81, v152, v80
+ ; GCN-NEXT: v_perm_b32 v95, v127, v121, s5
+ ; GCN-NEXT: v_perm_b32 v91, v126, v120, s5
+ ; GCN-NEXT: v_perm_b32 v93, v126, v120, s8
+ ; GCN-NEXT: v_perm_b32 v90, v124, v116, s5
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_max_f32_e32 v81, v81, v81
+ ; GCN-NEXT: v_max_f32_e32 v80, v80, v81
+ ; GCN-NEXT: ds_bpermute_b32 v81, v152, v80
+ ; GCN-NEXT: v_perm_b32 v92, v124, v116, s8
+ ; GCN-NEXT: v_perm_b32 v94, v125, v117, s5
+ ; GCN-NEXT: v_perm_b32 v96, v125, v117, s8
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_cndmask_b32_e64 v80, v81, v80, s[6:7]
+ ; GCN-NEXT: v_max_f32_e32 v80, v80, v80
+ ; GCN-NEXT: v_max_f32_e32 v215, v172, v80
+ ; GCN-NEXT: v_fma_f32 v85, s4, v49, -v215
+ ; GCN-NEXT: v_fma_f32 v84, s4, v48, -v215
+ ; GCN-NEXT: v_fma_f32 v86, s4, v50, -v215
+ ; GCN-NEXT: v_fma_f32 v87, s4, v51, -v215
+ ; GCN-NEXT: v_fma_f32 v52, s4, v52, -v215
+ ; GCN-NEXT: v_fma_f32 v53, s4, v53, -v215
+ ; GCN-NEXT: v_fma_f32 v54, s4, v54, -v215
+ ; GCN-NEXT: v_fma_f32 v55, s4, v55, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v85, 0x3fb8aa3b, v85
+ ; GCN-NEXT: v_fma_f32 v56, s4, v56, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v84, 0x3fb8aa3b, v84
+ ; GCN-NEXT: v_mul_f32_e32 v86, 0x3fb8aa3b, v86
+ ; GCN-NEXT: v_mul_f32_e32 v87, 0x3fb8aa3b, v87
+ ; GCN-NEXT: v_mul_f32_e32 v52, 0x3fb8aa3b, v52
+ ; GCN-NEXT: v_mul_f32_e32 v53, 0x3fb8aa3b, v53
+ ; GCN-NEXT: v_mul_f32_e32 v54, 0x3fb8aa3b, v54
+ ; GCN-NEXT: v_mul_f32_e32 v55, 0x3fb8aa3b, v55
+ ; GCN-NEXT: v_exp_f32_e32 v165, v85
+ ; GCN-NEXT: v_mul_f32_e32 v85, 0x3fb8aa3b, v56
+ ; GCN-NEXT: ds_read_b128 v[80:83], v148
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_sub_f32_e32 v88, v163, v215
+ ; GCN-NEXT: v_exp_f32_e32 v164, v84
+ ; GCN-NEXT: v_exp_f32_e32 v163, v87
+ ; GCN-NEXT: v_exp_f32_e32 v166, v52
+ ; GCN-NEXT: v_exp_f32_e32 v167, v53
+ ; GCN-NEXT: v_exp_f32_e32 v168, v54
+ ; GCN-NEXT: v_exp_f32_e32 v169, v55
+ ; GCN-NEXT: v_exp_f32_e32 v170, v85
+ ; GCN-NEXT: v_exp_f32_e32 v121, v86
+ ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v88
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v164
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v52, v165
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v121
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v163
+ ; GCN-NEXT: v_exp_f32_e32 v120, v88
+ ; GCN-NEXT: v_pack_b32_f16 v122, v84, v52
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v168
+ ; GCN-NEXT: v_pack_b32_f16 v123, v53, v54
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v169
+ ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[122:123], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v166
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v167
+ ; GCN-NEXT: v_fma_f32 v58, s4, v58, -v215
+ ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v86, 0x3fb8aa3b, v57
+ ; GCN-NEXT: v_mul_f32_e32 v87, 0x3fb8aa3b, v58
+ ; GCN-NEXT: v_mul_f32_e32 v88, 0x3fb8aa3b, v59
+ ; GCN-NEXT: v_pack_b32_f16 v124, v80, v81
+ ; GCN-NEXT: v_pack_b32_f16 v125, v84, v85
+ ; GCN-NEXT: ds_read_b128 v[48:51], v148 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[56:59], v148 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[52:55], v148 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_exp_f32_e32 v171, v86
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[124:125], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v172, v87
+ ; GCN-NEXT: v_exp_f32_e32 v173, v88
+ ; GCN-NEXT: ds_read_b128 v[86:89], v155
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v215
+ ; GCN-NEXT: v_fma_f32 v61, s4, v61, -v215
+ ; GCN-NEXT: v_fma_f32 v62, s4, v62, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v60
+ ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v61
+ ; GCN-NEXT: v_mul_f32_e32 v62, 0x3fb8aa3b, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v170
+ ; GCN-NEXT: v_exp_f32_e32 v174, v60
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v171
+ ; GCN-NEXT: v_exp_f32_e32 v175, v61
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v172
+ ; GCN-NEXT: v_exp_f32_e32 v176, v62
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v62, v173
+ ; GCN-NEXT: v_fma_f32 v63, s4, v63, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v63, 0x3fb8aa3b, v63
+ ; GCN-NEXT: v_pack_b32_f16 v126, v84, v60
+ ; GCN-NEXT: v_pack_b32_f16 v127, v61, v62
+ ; GCN-NEXT: ds_read_b128 v[80:83], v155 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[86:87], v[126:127], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v177, v63
+ ; GCN-NEXT: ds_read_b128 v[84:87], v155 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[60:63], v155 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v199, v[188:189]
+ ; GCN-NEXT: ds_write_b64 v149, v[90:91]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v200, v[190:191]
+ ; GCN-NEXT: ds_write_b64 v150, v[92:93]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v201, v[192:193]
+ ; GCN-NEXT: ds_write_b64 v151, v[94:95]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v202, v[194:195]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v101, v125
- ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131
+ ; GCN-NEXT: ds_write_b64 v186, v[96:97]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[100:101], v182, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31]
- ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128
- ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134
- ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v128, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v102, v142
- ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v129, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[106:107], v136, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v215
+ ; GCN-NEXT: v_fma_f32 v66, s4, v66, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v64
+ ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_mul_f32_e32 v66, 0x3fb8aa3b, v66
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v90, v174
+ ; GCN-NEXT: v_exp_f32_e32 v178, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v175
+ ; GCN-NEXT: v_exp_f32_e32 v179, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v176
+ ; GCN-NEXT: v_exp_f32_e32 v180, v66
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v177
+ ; GCN-NEXT: v_fma_f32 v67, s4, v67, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+ ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v215
+ ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v215
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99
- ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128
- ; GCN-NEXT: v_exp_f32_e32 v103, v150
- ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128
- ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126
- ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127
- ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5
- ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7
- ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128
- ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15]
- ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100
- ; GCN-NEXT: v_exp_f32_e32 v104, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101
- ; GCN-NEXT: ds_read_b128 v[130:133], v198
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47]
- ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135
- ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128
- ; GCN-NEXT: v_exp_f32_e32 v105, v125
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102
- ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7
- ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5
- ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v106, v156
- ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103
- ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128
- ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139
- ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136
- ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v107, v138
- ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15]
- ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104
- ; GCN-NEXT: v_exp_f32_e32 v108, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47]
- ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128
- ; GCN-NEXT: v_exp_f32_e32 v109, v125
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142
- ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131
- ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31]
- ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107
- ; GCN-NEXT: v_exp_f32_e32 v110, v156
- ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128
- ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v111, v146
- ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v80, v129
- ; GCN-NEXT: ds_read_b128 v[130:133], v197
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139
- ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47]
- ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128
- ; GCN-NEXT: v_exp_f32_e32 v81, v125
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144
- ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110
- ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128
- ; GCN-NEXT: v_exp_f32_e32 v82, v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111
- ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137
- ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128
- ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v83, v135
- ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134
- ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728
+ ; GCN-NEXT: v_pack_b32_f16 v130, v90, v64
+ ; GCN-NEXT: v_pack_b32_f16 v131, v65, v66
+ ; GCN-NEXT: v_mul_f32_e32 v92, 0x3fb8aa3b, v68
+ ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[130:131], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v181, v67
+ ; GCN-NEXT: ds_read_b128 v[66:69], v148
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v70, s4, v70, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v178
+ ; GCN-NEXT: v_fma_f32 v65, s4, v72, -v215
+ ; GCN-NEXT: v_exp_f32_e32 v182, v92
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v179
+ ; GCN-NEXT: v_exp_f32_e32 v183, v93
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v180
+ ; GCN-NEXT: v_exp_f32_e32 v184, v70
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v181
+ ; GCN-NEXT: v_fma_f32 v71, s4, v71, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v71, 0x3fb8aa3b, v71
+ ; GCN-NEXT: v_pack_b32_f16 v128, v64, v72
+ ; GCN-NEXT: v_pack_b32_f16 v129, v92, v70
+ ; GCN-NEXT: v_fma_f32 v73, s4, v73, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v93, 0x3fb8aa3b, v73
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[128:129], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v185, v71
+ ; GCN-NEXT: v_mul_f32_e32 v71, 0x3fb8aa3b, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v182
+ ; GCN-NEXT: v_exp_f32_e32 v187, v71
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v71, v183
+ ; GCN-NEXT: v_exp_f32_e32 v188, v93
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v92, v184
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v93, v185
+ ; GCN-NEXT: v_fma_f32 v74, s4, v74, -v215
+ ; GCN-NEXT: v_fma_f32 v75, s4, v75, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v94, 0x3fb8aa3b, v74
+ ; GCN-NEXT: v_mul_f32_e32 v95, 0x3fb8aa3b, v75
+ ; GCN-NEXT: v_fma_f32 v76, s4, v76, -v215
+ ; GCN-NEXT: v_fma_f32 v77, s4, v77, -v215
+ ; GCN-NEXT: v_fma_f32 v78, s4, v78, -v215
+ ; GCN-NEXT: v_fma_f32 v79, s4, v79, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v132, v70, v71
+ ; GCN-NEXT: v_pack_b32_f16 v133, v92, v93
+ ; GCN-NEXT: ds_read_b128 v[88:91], v148 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[72:75], v148 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[64:67], v148 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v96, 0x3fb8aa3b, v76
+ ; GCN-NEXT: v_mul_f32_e32 v97, 0x3fb8aa3b, v77
+ ; GCN-NEXT: v_mul_f32_e32 v98, 0x3fb8aa3b, v78
+ ; GCN-NEXT: v_mul_f32_e32 v99, 0x3fb8aa3b, v79
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[132:133], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v189, v94
+ ; GCN-NEXT: v_exp_f32_e32 v190, v95
+ ; GCN-NEXT: ds_read_b128 v[76:79], v155
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v187
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v188
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v189
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v71, v190
+ ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v134, v68, v69
+ ; GCN-NEXT: v_fma_f32 v33, s4, v33, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v135, v70, v71
+ ; GCN-NEXT: v_fma_f32 v34, s4, v34, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[134:135], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v191, v96
+ ; GCN-NEXT: v_exp_f32_e32 v192, v97
+ ; GCN-NEXT: v_exp_f32_e32 v193, v98
+ ; GCN-NEXT: v_exp_f32_e32 v194, v99
+ ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v33
+ ; GCN-NEXT: v_mul_f32_e32 v34, 0x3fb8aa3b, v34
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v191
+ ; GCN-NEXT: v_exp_f32_e32 v195, v32
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v192
+ ; GCN-NEXT: v_exp_f32_e32 v196, v33
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v193
+ ; GCN-NEXT: v_exp_f32_e32 v197, v34
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v194
+ ; GCN-NEXT: v_fma_f32 v35, s4, v35, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v35, 0x3fb8aa3b, v35
+ ; GCN-NEXT: v_fma_f32 v36, s4, v36, -v215
+ ; GCN-NEXT: v_fma_f32 v37, s4, v37, -v215
+ ; GCN-NEXT: v_fma_f32 v38, s4, v38, -v215
+ ; GCN-NEXT: v_fma_f32 v39, s4, v39, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v136, v76, v32
+ ; GCN-NEXT: v_pack_b32_f16 v137, v33, v34
+ ; GCN-NEXT: v_mul_f32_e32 v77, 0x3fb8aa3b, v36
+ ; GCN-NEXT: v_mul_f32_e32 v108, 0x3fb8aa3b, v37
+ ; GCN-NEXT: v_mul_f32_e32 v109, 0x3fb8aa3b, v38
+ ; GCN-NEXT: v_mul_f32_e32 v110, 0x3fb8aa3b, v39
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[136:137], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v198, v35
+ ; GCN-NEXT: v_perm_b32 v32, v102, v100, s5
+ ; GCN-NEXT: v_perm_b32 v33, v106, v104, s5
+ ; GCN-NEXT: v_perm_b32 v34, v102, v100, s8
+ ; GCN-NEXT: v_perm_b32 v35, v106, v104, s8
+ ; GCN-NEXT: v_perm_b32 v36, v103, v101, s5
+ ; GCN-NEXT: v_perm_b32 v38, v103, v101, s8
+ ; GCN-NEXT: v_perm_b32 v37, v107, v105, s5
+ ; GCN-NEXT: v_perm_b32 v39, v107, v105, s8
+ ; GCN-NEXT: ds_read_b128 v[92:95], v155 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[96:99], v155 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[68:71], v155 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v199, v[126:127]
+ ; GCN-NEXT: ds_write_b64 v149, v[32:33]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v200, v[150:151]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15]
+ ; GCN-NEXT: ds_write_b64 v150, v[34:35]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v201, v[152:153]
+ ; GCN-NEXT: ds_write_b64 v151, v[36:37]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v202, v[154:155]
- ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128
- ; GCN-NEXT: v_exp_f32_e32 v84, v129
- ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v85, v125
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: ds_write_b64 v186, v[38:39]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v199, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81
- ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31]
- ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128
- ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134
- ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[106:107], v200, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[116:117], v201, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[118:119], v202, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82
- ; GCN-NEXT: v_exp_f32_e32 v86, v156
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128
- ; GCN-NEXT: v_exp_f32_e32 v87, v157
- ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138
- ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128
- ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15]
- ; GCN-NEXT: ; implicit-def: $sgpr0
- ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5
- ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7
- ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128
- ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5
- ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7
- ; GCN-NEXT: ds_read_b128 v[130:133], v198
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84
- ; GCN-NEXT: v_exp_f32_e32 v88, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85
- ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5
- ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7
- ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5
- ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7
- ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v89, v125
- ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135
- ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86
- ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128
- ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138
- ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v90, v158
- ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87
- ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128
- ; GCN-NEXT: v_exp_f32_e32 v91, v139
- ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127
- ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126
- ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
- ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88
- ; GCN-NEXT: v_exp_f32_e32 v92, v129
- ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130
- ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89
- ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128
- ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v93, v125
- ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90
- ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126
- ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91
- ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v94, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v95, v127
- ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92
- ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131
- ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v125, v129
- ; GCN-NEXT: ds_read_b128 v[132:135], v197
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65
- ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128
- ; GCN-NEXT: v_exp_f32_e32 v126, v142
- ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94
- ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65
- ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95
- ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v127, v143
- ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v129, v138
- ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66
- ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728
+ ; GCN-NEXT: ds_read_b128 v[36:39], v148
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v76, v195
+ ; GCN-NEXT: v_exp_f32_e32 v199, v77
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v196
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v197
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v79, v198
+ ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v138, v76, v77
+ ; GCN-NEXT: v_fma_f32 v41, s4, v41, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v139, v78, v79
+ ; GCN-NEXT: v_fma_f32 v42, s4, v42, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v40
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[138:139], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v200, v108
+ ; GCN-NEXT: v_exp_f32_e32 v201, v109
+ ; GCN-NEXT: v_exp_f32_e32 v202, v110
+ ; GCN-NEXT: v_mul_f32_e32 v41, 0x3fb8aa3b, v41
+ ; GCN-NEXT: v_mul_f32_e32 v42, 0x3fb8aa3b, v42
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v199
+ ; GCN-NEXT: v_exp_f32_e32 v203, v40
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v200
+ ; GCN-NEXT: v_exp_f32_e32 v204, v41
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v41, v201
+ ; GCN-NEXT: v_exp_f32_e32 v205, v42
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v42, v202
+ ; GCN-NEXT: v_fma_f32 v43, s4, v43, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v43, 0x3fb8aa3b, v43
+ ; GCN-NEXT: v_pack_b32_f16 v140, v36, v40
+ ; GCN-NEXT: v_pack_b32_f16 v141, v41, v42
+ ; GCN-NEXT: ds_read_b128 v[32:35], v148 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[108:111], v148 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[76:79], v148 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[140:141], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v206, v43
+ ; GCN-NEXT: ds_read_b128 v[40:43], v155
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v37, s4, v44, -v215
+ ; GCN-NEXT: v_fma_f32 v44, s4, v45, -v215
+ ; GCN-NEXT: v_fma_f32 v45, s4, v46, -v215
+ ; GCN-NEXT: v_fma_f32 v46, s4, v47, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v47, 0x3fb8aa3b, v37
+ ; GCN-NEXT: v_mul_f32_e32 v44, 0x3fb8aa3b, v44
+ ; GCN-NEXT: v_mul_f32_e32 v45, 0x3fb8aa3b, v45
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v100, v203
+ ; GCN-NEXT: v_exp_f32_e32 v207, v47
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v47, v204
+ ; GCN-NEXT: v_exp_f32_e32 v208, v44
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v205
+ ; GCN-NEXT: v_exp_f32_e32 v209, v45
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v206
+ ; GCN-NEXT: v_mul_f32_e32 v46, 0x3fb8aa3b, v46
+ ; GCN-NEXT: v_pack_b32_f16 v142, v100, v47
+ ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v143, v44, v45
+ ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v215
+ ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v215
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[142:143], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v210, v46
+ ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+ ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v207
+ ; GCN-NEXT: v_exp_f32_e32 v211, v16
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v208
+ ; GCN-NEXT: v_exp_f32_e32 v212, v17
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v209
+ ; GCN-NEXT: v_exp_f32_e32 v213, v18
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v210
+ ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v215
+ ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+ ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v215
+ ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v215
+ ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v215
+ ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v215
+ ; GCN-NEXT: v_pack_b32_f16 v144, v40, v16
+ ; GCN-NEXT: v_pack_b32_f16 v145, v17, v18
+ ; GCN-NEXT: v_mul_f32_e32 v44, 0x3fb8aa3b, v20
+ ; GCN-NEXT: v_mul_f32_e32 v45, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_mul_f32_e32 v46, 0x3fb8aa3b, v22
+ ; GCN-NEXT: v_mul_f32_e32 v47, 0x3fb8aa3b, v23
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[144:145], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v214, v19
+ ; GCN-NEXT: v_perm_b32 v16, v106, v104, s5
+ ; GCN-NEXT: v_perm_b32 v17, v118, v116, s5
+ ; GCN-NEXT: v_perm_b32 v18, v106, v104, s8
+ ; GCN-NEXT: v_perm_b32 v19, v118, v116, s8
+ ; GCN-NEXT: v_perm_b32 v20, v107, v105, s5
+ ; GCN-NEXT: v_perm_b32 v22, v107, v105, s8
+ ; GCN-NEXT: v_perm_b32 v21, v119, v117, s5
+ ; GCN-NEXT: v_perm_b32 v23, v119, v117, s8
+ ; GCN-NEXT: ds_read_b128 v[36:39], v155 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[112:115], v155 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[100:103], v155 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b64 v199, v[150:151]
+ ; GCN-NEXT: ds_write_b64 v149, v[16:17]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v200, v[152:153]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125
- ; GCN-NEXT: v_exp_f32_e32 v130, v158
+ ; GCN-NEXT: ds_write_b64 v150, v[18:19]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v201, v[154:155]
+ ; GCN-NEXT: ds_write_b64 v151, v[20:21]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v202, v[156:157]
+ ; GCN-NEXT: ds_write_b64 v186, v[22:23]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126
- ; GCN-NEXT: v_exp_f32_e32 v131, v144
- ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128
- ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127
- ; GCN-NEXT: v_exp_f32_e32 v132, v145
- ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128
- ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65
- ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128
- ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63]
- ; GCN-NEXT: v_exp_f32_e32 v133, v141
- ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[68:71], v198
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v72, v146
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143
- ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131
- ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v73, v144
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132
- ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128
- ; GCN-NEXT: v_exp_f32_e32 v74, v65
- ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133
- ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63]
- ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128
- ; GCN-NEXT: v_exp_f32_e32 v75, v142
- ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138
- ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
- ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128
- ; GCN-NEXT: v_exp_f32_e32 v76, v146
- ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68
- ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73
- ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v77, v147
- ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68
- ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74
- ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v78, v67
- ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68
- ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75
- ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128
- ; GCN-NEXT: v_exp_f32_e32 v79, v148
- ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65
- ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15]
- ; GCN-NEXT: v_exp_f32_e32 v142, v146
- ; GCN-NEXT: ds_read_b128 v[68:71], v197
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
- ; GCN-NEXT: v_exp_f32_e32 v137, v147
- ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
- ; GCN-NEXT: v_exp_f32_e32 v138, v138
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
- ; GCN-NEXT: s_nop 10
- ; GCN-NEXT: v_exp_f32_e32 v52, v128
- ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137
- ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142
- ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138
- ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52
- ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79
- ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50
- ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136
- ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53
- ; GCN-NEXT: v_add_f32_e32 v53, 0, v113
- ; GCN-NEXT: v_add_f32_e32 v53, v114, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v115, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v116, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v117, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v118, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v119, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v120, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v121, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v122, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v123, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v124, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v96, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v97, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v98, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v99, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v100, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v101, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v102, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v103, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v104, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v105, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v106, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v107, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v108, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v109, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v110, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v111, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v80, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v81, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v82, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v83, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v84, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v85, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v86, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v87, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v88, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v89, v53
- ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49
- ; GCN-NEXT: v_add_f32_e32 v53, v90, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v91, v53
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
- ; GCN-NEXT: v_add_f32_e32 v53, v92, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v93, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v94, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v95, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v125, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v126, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v127, v53
- ; GCN-NEXT: v_add_f32_e32 v53, v129, v53
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
- ; GCN-NEXT: s_nop 9
- ; GCN-NEXT: v_add_f32_e32 v0, v130, v53
- ; GCN-NEXT: v_add_f32_e32 v0, v131, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v132, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v133, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v72, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v73, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v74, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v75, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v76, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v77, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v78, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v79, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v142, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v137, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v138, v0
- ; GCN-NEXT: v_add_f32_e32 v4, v52, v0
- ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
- ; GCN-NEXT: v_add_f32_e32 v2, v4, v5
- ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2
- ; GCN-NEXT: ; implicit-def: $vgpr4
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13]
- ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112
- ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[16:19], v148
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v211
+ ; GCN-NEXT: v_fma_f32 v21, s4, v24, -v215
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v212
+ ; GCN-NEXT: v_fma_f32 v23, s4, v25, -v215
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v213
+ ; GCN-NEXT: v_fma_f32 v25, s4, v26, -v215
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v26, v214
+ ; GCN-NEXT: v_pack_b32_f16 v146, v20, v22
+ ; GCN-NEXT: v_fma_f32 v27, s4, v27, -v215
+ ; GCN-NEXT: ds_read_b128 v[40:43], v148 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v147, v24, v26
+ ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[146:147], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v186, v44
+ ; GCN-NEXT: v_exp_f32_e32 v216, v45
+ ; GCN-NEXT: v_exp_f32_e32 v217, v46
+ ; GCN-NEXT: v_exp_f32_e32 v218, v47
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v186
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v216
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v217
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v218
+ ; GCN-NEXT: v_mul_f32_e32 v25, 0x3fb8aa3b, v25
+ ; GCN-NEXT: v_mul_f32_e32 v27, 0x3fb8aa3b, v27
+ ; GCN-NEXT: ds_read_b128 v[116:119], v148 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[104:107], v148 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_pack_b32_f16 v148, v16, v20
+ ; GCN-NEXT: v_pack_b32_f16 v149, v22, v24
+ ; GCN-NEXT: v_fma_f32 v17, s4, v28, -v215
+ ; GCN-NEXT: ds_read_b128 v[44:47], v155
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[148:149], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v219, v21
+ ; GCN-NEXT: v_fma_f32 v21, s4, v29, -v215
+ ; GCN-NEXT: v_exp_f32_e32 v220, v23
+ ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v215
+ ; GCN-NEXT: v_exp_f32_e32 v221, v25
+ ; GCN-NEXT: v_fma_f32 v25, s4, v31, -v215
+ ; GCN-NEXT: v_exp_f32_e32 v215, v27
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v219
+ ; GCN-NEXT: v_exp_f32_e32 v222, v17
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v220
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v221
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v215
+ ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+ ; GCN-NEXT: v_mul_f32_e32 v25, 0x3fb8aa3b, v25
+ ; GCN-NEXT: v_pack_b32_f16 v150, v16, v17
+ ; GCN-NEXT: v_pack_b32_f16 v151, v18, v19
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_add_u32_e32 v162, s0, v162
+ ; GCN-NEXT: v_add_u32_e32 v157, s0, v157
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[150:151], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v223, v21
+ ; GCN-NEXT: v_exp_f32_e32 v224, v23
+ ; GCN-NEXT: v_exp_f32_e32 v225, v25
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v222
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v226, v223
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v227, v224
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v225
+ ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: v_add_u32_e32 v161, s0, v161
+ ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v226, v44, v226
+ ; GCN-NEXT: v_pack_b32_f16 v227, v227, v45
+ ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[226:227], v[0:15]
+ ; GCN-NEXT: v_add_u32_e32 v160, s0, v160
+ ; GCN-NEXT: v_add_u32_e32 v159, s0, v159
+ ; GCN-NEXT: v_add_u32_e32 v158, s0, v158
+ ; GCN-NEXT: v_add_u32_e32 v156, s0, v156
+ ; GCN-NEXT: v_add_u32_e32 v154, s0, v154
+ ; GCN-NEXT: s_nop 5
+ ; GCN-NEXT: ds_read_b128 v[0:3], v155 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_add_u32_e32 v153, s0, v153
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[48:49], v[122:123], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[50:51], v[124:125], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[126:127], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[130:131], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[88:89], v[128:129], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[90:91], v[132:133], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[92:93], v[134:135], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[94:95], v[136:137], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[138:139], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[140:141], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[142:143], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[144:145], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[40:41], v[146:147], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[42:43], v[148:149], v[16:31]
+ ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: v_mul_f32_e64 v32, v32, v120
+ ; GCN-NEXT: v_mul_f32_e64 v33, v33, v120
+ ; GCN-NEXT: v_mul_f32_e64 v34, v34, v120
+ ; GCN-NEXT: v_mul_f32_e64 v35, v35, v120
+ ; GCN-NEXT: v_mul_f32_e64 v36, v36, v120
+ ; GCN-NEXT: v_mul_f32_e64 v37, v37, v120
+ ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[150:151], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e64 v44, v44, v120
+ ; GCN-NEXT: v_mul_f32_e64 v45, v45, v120
+ ; GCN-NEXT: v_mul_f32_e64 v46, v46, v120
+ ; GCN-NEXT: v_mul_f32_e64 v47, v47, v120
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[226:227], v[16:31]
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[56:57], v[122:123], v[32:47]
+ ; GCN-NEXT: s_nop 8
+ ; GCN-NEXT: ds_read_b128 v[14:17], v155 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[58:59], v[124:125], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[84:85], v[126:127], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[86:87], v[130:131], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[128:129], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[132:133], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[96:97], v[134:135], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[98:99], v[136:137], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[108:109], v[138:139], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[110:111], v[140:141], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[112:113], v[142:143], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[114:115], v[144:145], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[116:117], v[146:147], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[118:119], v[148:149], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[14:15], v[150:151], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: v_mul_f32_e64 v0, v0, v120
+ ; GCN-NEXT: v_mul_f32_e64 v1, v1, v120
+ ; GCN-NEXT: v_mul_f32_e64 v2, v2, v120
+ ; GCN-NEXT: v_mul_f32_e64 v3, v3, v120
+ ; GCN-NEXT: v_mul_f32_e64 v4, v4, v120
+ ; GCN-NEXT: v_mul_f32_e64 v5, v5, v120
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[120:121] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[16:17], v[226:227], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr16
+ ; GCN-NEXT: v_mov_b32_e32 v20, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, 0, v164
+ ; GCN-NEXT: v_add_f32_e32 v16, v165, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v121, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v163, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v166, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v167, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[52:53], v[122:123], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v168, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v169, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v170, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v171, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v172, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v173, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v174, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[54:55], v[124:125], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v175, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v176, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v177, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v178, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v179, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v180, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v181, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[60:61], v[126:127], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v182, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v183, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v184, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v185, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v187, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v188, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v189, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[62:63], v[130:131], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v190, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v191, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v192, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v193, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v194, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v195, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v196, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[128:129], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v197, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v198, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v199, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v200, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v201, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v202, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v203, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[132:133], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v204, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v205, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v206, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v207, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v208, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v209, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v210, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[134:135], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v211, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v212, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v213, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v214, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v186, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v216, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v217, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[136:137], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v218, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v219, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v220, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v221, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v215, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v222, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v223, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[138:139], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v224, v16
+ ; GCN-NEXT: v_add_f32_e32 v21, v225, v16
+ ; GCN-NEXT: ds_read_b128 v[16:19], v155 offset:1728
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_bpermute_b32 v18, v152, v21
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: v_add_f32_e32 v18, v21, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[140:141], v[0:15]
+ ; GCN-NEXT: ds_bpermute_b32 v19, v152, v18
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[100:101], v[142:143], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[102:103], v[144:145], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[104:105], v[146:147], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[106:107], v[148:149], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[150:151], v[0:15]
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: s_nop 9
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, v19, v18, s[6:7]
+ ; GCN-NEXT: v_fmac_f32_e32 v0, v20, v120
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..d011c64ae1e43 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -8,486 +8,486 @@
; GCN: ; %bb.0:
; GCN-NEXT: ; implicit-def: $vgpr2
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_readfirstlane_b32 s20, v2
+ ; GCN-NEXT: v_readfirstlane_b32 s6, v2
; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
- ; GCN-NEXT: ; implicit-def: $vgpr50
- ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
- ; GCN-NEXT: ; implicit-def: $vgpr49
- ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
- ; GCN-NEXT: ; implicit-def: $vgpr51
- ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
- ; GCN-NEXT: ; implicit-def: $vgpr76
- ; GCN-NEXT: ; implicit-def: $vgpr77
; GCN-NEXT: ; implicit-def: $vgpr78
+ ; GCN-NEXT: ; implicit-def: $vgpr17
+ ; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11
+ ; GCN-NEXT: ; implicit-def: $vgpr72
+ ; GCN-NEXT: ; implicit-def: $vgpr52_vgpr53_vgpr54_vgpr55
+ ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51
+ ; GCN-NEXT: ; implicit-def: $vgpr56_vgpr57_vgpr58_vgpr59
+ ; GCN-NEXT: ; implicit-def: $sgpr16
+ ; GCN-NEXT: ; implicit-def: $sgpr17
; GCN-NEXT: ; implicit-def: $vgpr79
+ ; GCN-NEXT: ; implicit-def: $vgpr86
; GCN-NEXT: ; implicit-def: $vgpr80
- ; GCN-NEXT: ; implicit-def: $vgpr91
- ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
+ ; GCN-NEXT: ; implicit-def: $vgpr85
+ ; GCN-NEXT: v_max_f32_e32 v87, v85, v85
; GCN-NEXT: ; iglp_opt mask(0x00000002)
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: v_lshl_add_u32 v2, s6, 4, v3
; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: s_lshl_b32 s4, s20, 7
+ ; GCN-NEXT: s_lshl_b32 s4, s6, 7
+ ; GCN-NEXT: ; implicit-def: $vgpr5
+ ; GCN-NEXT: v_add_lshl_u32 v16, v5, s4, 1
; GCN-NEXT: ; implicit-def: $vgpr5
- ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1
- ; GCN-NEXT: v_add_u32_e32 v76, s20, v76
- ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76
+ ; GCN-NEXT: ; implicit-def: $sgpr4
+ ; GCN-NEXT: v_add_u32_e32 v5, v5, v78
+ ; GCN-NEXT: ; kill: killed $vgpr5
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v48, v[0:3]
+ ; GCN-NEXT: ds_write_b128 v16, v[0:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:64 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr0
- ; GCN-NEXT: ; implicit-def: $vgpr1
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: ; implicit-def: $sgpr6
- ; GCN-NEXT: v_add_u32_e32 v0, v0, v50
- ; GCN-NEXT: v_add_u32_e32 v1, v1, v50
- ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr4
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GCN-NEXT: v_add_u32_e32 v4, v4, v78
+ ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v4, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[70:71], v5, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: ds_read_b128 v[36:39], v49
+ ; GCN-NEXT: ; kill: killed $vgpr4
+ ; GCN-NEXT: ds_read_b128 v[4:7], v17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512
+ ; GCN-NEXT: ds_read_b128 v[12:15], v17 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
- ; GCN-NEXT: ; kill: killed $vgpr1
- ; GCN-NEXT: ; kill: killed $vgpr0
- ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6
- ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1
- ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76
- ; GCN-NEXT: ; implicit-def: $sgpr5
- ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77
- ; GCN-NEXT: ; implicit-def: $sgpr2
- ; GCN-NEXT: ; implicit-def: $sgpr3
- ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78
- ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[36:39], v51
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[10:11], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[4:7], v72
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512
+ ; GCN-NEXT: ds_read_b128 v[64:67], v72 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b128 v48, v[32:35]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31]
+ ; GCN-NEXT: ds_write_b128 v16, v[0:3]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[52:53], v[32:47]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[32:35], v49
+ ; GCN-NEXT: ds_read_b128 v[0:3], v17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
- ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
- ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512
+ ; GCN-NEXT: ds_read_b128 v[60:63], v17 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[68:71], v51
+ ; GCN-NEXT: ; implicit-def: $vgpr4
+ ; GCN-NEXT: ; implicit-def: $vgpr5
+ ; GCN-NEXT: v_add_u32_e32 v5, v5, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[54:55], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr6
+ ; GCN-NEXT: v_add_u32_e32 v6, v6, v78
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[48:49], v[32:47]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[50:51], v[32:47]
+ ; GCN-NEXT: ds_read_b128 v[0:3], v72
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
- ; GCN-NEXT: ; implicit-def: $vgpr32
- ; GCN-NEXT: ; implicit-def: $vgpr33
- ; GCN-NEXT: v_add_u32_e32 v82, v32, v50
- ; GCN-NEXT: v_add_u32_e32 v83, v33, v50
- ; GCN-NEXT: ; kill: killed $vgpr82
- ; GCN-NEXT: ; kill: killed $vgpr83
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
- ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[56:57], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr0
+ ; GCN-NEXT: ; implicit-def: $vgpr1
+ ; GCN-NEXT: v_add_u32_e32 v0, s6, v0
+ ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
+ ; GCN-NEXT: v_mul_lo_u32 v0, v0, s4
+ ; GCN-NEXT: v_add_lshl_u32 v81, v1, v0, 1
+ ; GCN-NEXT: v_perm_b32 v0, v70, v68, s16
+ ; GCN-NEXT: v_perm_b32 v1, v70, v68, s17
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[58:59], v[32:47]
+ ; GCN-NEXT: ; implicit-def: $vgpr2
+ ; GCN-NEXT: v_lshl_add_u32 v82, v2, 1, v81
+ ; GCN-NEXT: ; implicit-def: $vgpr3
+ ; GCN-NEXT: v_lshl_add_u32 v83, v3, 1, v82
+ ; GCN-NEXT: v_lshl_add_u32 v84, v4, 1, v83
+ ; GCN-NEXT: v_perm_b32 v2, v71, v69, s16
+ ; GCN-NEXT: v_perm_b32 v3, v71, v69, s17
+ ; GCN-NEXT: ds_read_b128 v[68:71], v72 offset:512
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
- ; GCN-NEXT: ; implicit-def: $vgpr66
- ; GCN-NEXT: ; implicit-def: $vgpr67
- ; GCN-NEXT: v_max_f32_e32 v81, v67, v67
- ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
- ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2
- ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3
- ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[12:13], v[8:9], 0
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b32 v76, v70
+ ; GCN-NEXT: ds_write_b32 v81, v0
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v77, v71
+ ; GCN-NEXT: ds_write_b32 v82, v1
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v78, v72
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
- ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19
- ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21
- ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23
- ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24
- ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25
- ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27
- ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29
- ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31
- ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1
- ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84
- ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3
- ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5
- ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6
- ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7
- ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69
- ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8
- ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9
- ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80
- ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10
- ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11
- ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85
- ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12
- ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13
- ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65
- ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14
- ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15
- ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68
- ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74
- ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64
- ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3
+ ; GCN-NEXT: ds_write_b32 v83, v2
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v79, v68
- ; GCN-NEXT: ; implicit-def: $vgpr84
- ; GCN-NEXT: v_max_f32_e32 v65, v65, v65
- ; GCN-NEXT: v_max_f32_e32 v70, v64, v65
+ ; GCN-NEXT: ds_write_b32 v84, v3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: buffer_load_dwordx2 v[76:77], v5, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[14:15], v[10:11], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[52:53], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[54:55], v[16:31]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[60:61], v[48:49], v[16:31]
+ ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v6, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: ; implicit-def: $sgpr2
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v34
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v35
+ ; GCN-NEXT: v_mul_f32_e32 v90, s4, v36
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1]
- ; GCN-NEXT: v_max_f32_e32 v70, v70, v70
- ; GCN-NEXT: v_max_f32_e32 v72, v81, v70
- ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72
- ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72
- ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72
- ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16
- ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
- ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72
- ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72
- ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72
- ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72
- ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72
- ; GCN-NEXT: v_exp_f32_e32 v73, v16
- ; GCN-NEXT: v_exp_f32_e32 v74, v18
- ; GCN-NEXT: v_exp_f32_e32 v75, v19
- ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20
- ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22
- ; GCN-NEXT: v_exp_f32_e32 v80, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73
- ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72
- ; GCN-NEXT: v_exp_f32_e32 v81, v21
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74
- ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72
- ; GCN-NEXT: v_exp_f32_e32 v82, v22
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
- ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
- ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72
- ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72
- ; GCN-NEXT: v_exp_f32_e32 v83, v23
- ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72
- ; GCN-NEXT: v_exp_f32_e32 v85, v22
- ; GCN-NEXT: v_exp_f32_e32 v17, v17
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17
- ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72
- ; GCN-NEXT: v_exp_f32_e32 v88, v23
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72
- ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19
- ; GCN-NEXT: ds_read_b128 v[18:21], v84
+ ; GCN-NEXT: ds_read_b128 v[72:75], v86
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v16, v24
- ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[62:63], v[50:51], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v62, s4, v32
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v33
+ ; GCN-NEXT: v_max3_f32 v62, v62, s2, v63
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v37
+ ; GCN-NEXT: v_max3_f32 v62, v62, v88, v89
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v38
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v39
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[56:57], v[16:31]
+ ; GCN-NEXT: v_max3_f32 v62, v62, v90, v63
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v40
+ ; GCN-NEXT: v_mul_f32_e32 v90, s4, v41
+ ; GCN-NEXT: v_max3_f32 v62, v62, v88, v89
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v42
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v43
+ ; GCN-NEXT: v_max3_f32 v62, v62, v63, v90
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[58:59], v[16:31]
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v44
+ ; GCN-NEXT: v_mul_f32_e32 v56, s4, v45
+ ; GCN-NEXT: v_max3_f32 v57, v62, v88, v89
+ ; GCN-NEXT: v_mul_f32_e32 v62, s4, v46
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v47
+ ; GCN-NEXT: v_max3_f32 v56, v57, v63, v56
+ ; GCN-NEXT: v_max3_f32 v56, v56, v62, v68
+ ; GCN-NEXT: s_nop 3
+ ; GCN-NEXT: v_mul_f32_e32 v58, s4, v16
+ ; GCN-NEXT: v_mul_f32_e32 v59, s4, v17
+ ; GCN-NEXT: v_mul_f32_e32 v62, s4, v18
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v19
+ ; GCN-NEXT: v_max3_f32 v56, v56, v58, v59
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v20
+ ; GCN-NEXT: v_mul_f32_e32 v69, s4, v21
+ ; GCN-NEXT: v_max3_f32 v56, v56, v62, v63
+ ; GCN-NEXT: v_mul_f32_e32 v70, s4, v22
+ ; GCN-NEXT: v_mul_f32_e32 v71, s4, v23
+ ; GCN-NEXT: v_max3_f32 v56, v56, v68, v69
+ ; GCN-NEXT: v_mul_f32_e32 v88, s4, v24
+ ; GCN-NEXT: v_mul_f32_e32 v89, s4, v25
+ ; GCN-NEXT: v_max3_f32 v56, v56, v70, v71
+ ; GCN-NEXT: v_mul_f32_e32 v90, s4, v26
+ ; GCN-NEXT: v_mul_f32_e32 v58, s4, v27
+ ; GCN-NEXT: v_max3_f32 v56, v56, v88, v89
+ ; GCN-NEXT: v_mul_f32_e32 v59, s4, v28
+ ; GCN-NEXT: v_mul_f32_e32 v62, s4, v29
+ ; GCN-NEXT: v_max3_f32 v56, v56, v90, v58
+ ; GCN-NEXT: v_mul_f32_e32 v63, s4, v30
+ ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31
+ ; GCN-NEXT: v_max3_f32 v56, v56, v59, v62
+ ; GCN-NEXT: v_max3_f32 v56, v56, v63, v68
+ ; GCN-NEXT: ds_bpermute_b32 v58, v79, v56
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_read_b128 v[52:55], v86 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v18, 0, v73
- ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83
- ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80
- ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72
- ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
- ; GCN-NEXT: v_add_f32_e32 v17, v17, v18
- ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26
- ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81
- ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72
- ; GCN-NEXT: v_exp_f32_e32 v30, v18
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82
- ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72
- ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2
- ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3
- ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3
- ; GCN-NEXT: ds_read_b128 v[26:29], v91
+ ; GCN-NEXT: ds_read_b128 v[64:67], v80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576
+ ; GCN-NEXT: ds_read_b128 v[48:51], v80 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: v_perm_b32 v57, v60, v76, s16
; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: ds_write_b32 v76, v31
- ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_exp_f32_e32 v31, v31
- ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18
- ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86
- ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89
+ ; GCN-NEXT: ds_write_b32 v81, v57
+ ; GCN-NEXT: v_max_f32_e32 v57, v58, v58
+ ; GCN-NEXT: v_max_f32_e32 v56, v56, v57
+ ; GCN-NEXT: ds_bpermute_b32 v57, v79, v56
+ ; GCN-NEXT: v_perm_b32 v59, v60, v76, s17
+ ; GCN-NEXT: v_perm_b32 v60, v61, v77, s16
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v77, v64
+ ; GCN-NEXT: ds_write_b32 v82, v59
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v78, v90
+ ; GCN-NEXT: ds_write_b32 v83, v60
+ ; GCN-NEXT: v_cndmask_b32_e64 v56, v57, v56, s[0:1]
+ ; GCN-NEXT: v_max_f32_e32 v56, v56, v56
+ ; GCN-NEXT: v_max_f32_e32 v56, v87, v56
+ ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v56
+ ; GCN-NEXT: v_fma_f32 v34, s4, v34, -v56
+ ; GCN-NEXT: v_fma_f32 v35, s4, v35, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v32
+ ; GCN-NEXT: v_mul_f32_e32 v34, 0x3fb8aa3b, v34
+ ; GCN-NEXT: v_mul_f32_e32 v35, 0x3fb8aa3b, v35
+ ; GCN-NEXT: v_fma_f32 v33, s4, v33, -v56
+ ; GCN-NEXT: v_fma_f32 v36, s4, v36, -v56
+ ; GCN-NEXT: v_fma_f32 v37, s4, v37, -v56
+ ; GCN-NEXT: v_fma_f32 v38, s4, v38, -v56
+ ; GCN-NEXT: v_fma_f32 v39, s4, v39, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v58, v32
+ ; GCN-NEXT: v_exp_f32_e32 v59, v34
+ ; GCN-NEXT: v_exp_f32_e32 v60, v35
+ ; GCN-NEXT: v_perm_b32 v61, v61, v77, s17
+ ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v36
+ ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v37
+ ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v38
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b32 v79, v65
- ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73
- ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v17, v74, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85
- ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72
- ; GCN-NEXT: v_exp_f32_e32 v22, v64
- ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88
- ; GCN-NEXT: v_exp_f32_e32 v64, v65
+ ; GCN-NEXT: ds_write_b32 v84, v61
+ ; GCN-NEXT: v_exp_f32_e32 v61, v36
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58
+ ; GCN-NEXT: v_fma_f32 v35, s4, v40, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v62, v37
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59
+ ; GCN-NEXT: v_fma_f32 v40, s4, v41, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v63, v38
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v60
+ ; GCN-NEXT: v_mul_f32_e32 v39, 0x3fb8aa3b, v39
+ ; GCN-NEXT: v_fma_f32 v41, s4, v42, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v68, v39
+ ; GCN-NEXT: v_mul_f32_e32 v39, 0x3fb8aa3b, v35
+ ; GCN-NEXT: v_pack_b32_f16 v35, v37, v38
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v61
+ ; GCN-NEXT: v_fma_f32 v42, s4, v43, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v33
+ ; GCN-NEXT: v_mul_f32_e32 v42, 0x3fb8aa3b, v42
+ ; GCN-NEXT: v_exp_f32_e32 v71, v42
+ ; GCN-NEXT: v_exp_f32_e32 v33, v33
+ ; GCN-NEXT: v_sub_f32_e32 v57, v85, v56
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+ ; GCN-NEXT: v_exp_f32_e32 v32, v57
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v33
+ ; GCN-NEXT: v_fma_f32 v43, s4, v44, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v44, v39
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v63
+ ; GCN-NEXT: v_fma_f32 v45, s4, v45, -v56
+ ; GCN-NEXT: v_pack_b32_f16 v34, v34, v36
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mul_f32_e32 v45, 0x3fb8aa3b, v45
+ ; GCN-NEXT: v_exp_f32_e32 v45, v45
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[34:35], v[0:15]
+ ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v40
+ ; GCN-NEXT: v_add_f32_e32 v36, 0, v58
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v70, v68
+ ; GCN-NEXT: v_fma_f32 v47, s4, v47, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v58, v40
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v62
+ ; GCN-NEXT: v_mul_f32_e32 v47, 0x3fb8aa3b, v47
+ ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v47, v47
+ ; GCN-NEXT: v_add_f32_e32 v33, v33, v36
+ ; GCN-NEXT: v_fma_f32 v36, s4, v46, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v41, 0x3fb8aa3b, v41
+ ; GCN-NEXT: v_mul_f32_e32 v43, 0x3fb8aa3b, v43
+ ; GCN-NEXT: v_mul_f32_e32 v42, 0x3fb8aa3b, v36
+ ; GCN-NEXT: v_pack_b32_f16 v36, v37, v57
+ ; GCN-NEXT: v_pack_b32_f16 v37, v69, v70
+ ; GCN-NEXT: v_exp_f32_e32 v46, v41
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v44
+ ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v16
+ ; GCN-NEXT: v_exp_f32_e32 v70, v42
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[36:37], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v33, v59, v33
+ ; GCN-NEXT: v_fma_f32 v16, s4, v18, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v59, v43
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v43, v58
+ ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v42, 0x3fb8aa3b, v16
+ ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
+ ; GCN-NEXT: v_pack_b32_f16 v16, v57, v43
+ ; GCN-NEXT: v_exp_f32_e32 v57, v69
+ ; GCN-NEXT: v_add_f32_e32 v33, v60, v33
+ ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v17
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v46
+ ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v60, v60
+ ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v56
+ ; GCN-NEXT: v_add_f32_e32 v33, v61, v33
+ ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v56
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v71
+ ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v56
; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
- ; GCN-NEXT: v_add_f32_e32 v17, v75, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30
- ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72
- ; GCN-NEXT: v_exp_f32_e32 v23, v23
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31
- ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0
- ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21
- ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19
- ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72
- ; GCN-NEXT: v_exp_f32_e32 v25, v67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v17, v80, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22
- ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72
- ; GCN-NEXT: v_exp_f32_e32 v27, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64
- ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72
- ; GCN-NEXT: v_exp_f32_e32 v65, v65
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
- ; GCN-NEXT: v_add_f32_e32 v17, v81, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23
- ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72
- ; GCN-NEXT: v_exp_f32_e32 v68, v2
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25
+ ; GCN-NEXT: v_exp_f32_e32 v72, v23
+ ; GCN-NEXT: v_pack_b32_f16 v17, v17, v18
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_read_b128 v[0:3], v84
+ ; GCN-NEXT: ds_read_b128 v[38:41], v86
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4
- ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19
- ; GCN-NEXT: v_exp_f32_e32 v24, v24
- ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576
+ ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[16:17], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v59
+ ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v64, v42
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v43, v45
+ ; GCN-NEXT: v_fma_f32 v25, s4, v25, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v65, v19
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v70
+ ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v56
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v42, v47
+ ; GCN-NEXT: v_pack_b32_f16 v18, v18, v43
+ ; GCN-NEXT: v_fma_f32 v69, s4, v27, -v56
+ ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
+ ; GCN-NEXT: v_pack_b32_f16 v19, v61, v42
+ ; GCN-NEXT: v_exp_f32_e32 v61, v20
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[18:19], v[0:15]
+ ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v66, v21
+ ; GCN-NEXT: v_fma_f32 v29, s4, v29, -v56
+ ; GCN-NEXT: v_exp_f32_e32 v67, v22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v42, v57
+ ; GCN-NEXT: v_fma_f32 v30, s4, v30, -v56
+ ; GCN-NEXT: v_fma_f32 v31, s4, v31, -v56
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v64
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v65
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v66
+ ; GCN-NEXT: v_pack_b32_f16 v42, v42, v20
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v75, v67
+ ; GCN-NEXT: v_pack_b32_f16 v43, v21, v22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v72
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v24
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[42:43], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v61
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v25
+ ; GCN-NEXT: v_mul_f32_e32 v25, 0x3fb8aa3b, v26
+ ; GCN-NEXT: v_mul_f32_e32 v56, 0x3fb8aa3b, v69
+ ; GCN-NEXT: v_mul_f32_e32 v69, 0x3fb8aa3b, v28
+ ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v29
+ ; GCN-NEXT: v_mul_f32_e32 v30, 0x3fb8aa3b, v30
+ ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v31
+ ; GCN-NEXT: v_pack_b32_f16 v28, v38, v73
+ ; GCN-NEXT: v_pack_b32_f16 v29, v75, v77
+ ; GCN-NEXT: ; implicit-def: $sgpr0
+ ; GCN-NEXT: v_add_u32_e32 v78, s0, v78
+ ; GCN-NEXT: s_nop 0
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[28:29], v[0:15]
+ ; GCN-NEXT: v_exp_f32_e32 v40, v69
+ ; GCN-NEXT: v_exp_f32_e32 v69, v81
+ ; GCN-NEXT: v_exp_f32_e32 v81, v31
+ ; GCN-NEXT: v_exp_f32_e32 v74, v24
+ ; GCN-NEXT: v_exp_f32_e32 v76, v25
+ ; GCN-NEXT: v_exp_f32_e32 v56, v56
+ ; GCN-NEXT: v_exp_f32_e32 v75, v30
+ ; GCN-NEXT: v_exp_f32_e32 v39, v23
+ ; GCN-NEXT: ds_read_b128 v[20:23], v86 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26
- ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v17, v82, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27
- ; GCN-NEXT: v_exp_f32_e32 v26, v26
- ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65
- ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72
- ; GCN-NEXT: v_exp_f32_e32 v67, v67
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
- ; GCN-NEXT: v_add_f32_e32 v17, v83, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68
- ; GCN-NEXT: v_exp_f32_e32 v6, v6
- ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24
- ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7
- ; GCN-NEXT: v_exp_f32_e32 v7, v7
- ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29
- ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69
- ; GCN-NEXT: ; implicit-def: $sgpr2
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v0, v85, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26
- ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
- ; GCN-NEXT: v_add_f32_e32 v4, v88, v0
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6
- ; GCN-NEXT: v_exp_f32_e32 v10, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7
- ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
- ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28
- ; GCN-NEXT: s_nop 1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v2, v30, v4
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
- ; GCN-NEXT: v_add_f32_e32 v0, v31, v2
- ; GCN-NEXT: v_add_f32_e32 v0, v22, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v64, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v23, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v25, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v27, v0
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72
- ; GCN-NEXT: v_add_f32_e32 v0, v65, v0
- ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72
- ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8
- ; GCN-NEXT: v_add_f32_e32 v0, v68, v0
- ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72
- ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72
- ; GCN-NEXT: v_exp_f32_e32 v8, v8
- ; GCN-NEXT: v_add_f32_e32 v0, v24, v0
- ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72
- ; GCN-NEXT: v_exp_f32_e32 v9, v9
- ; GCN-NEXT: v_add_f32_e32 v0, v26, v0
- ; GCN-NEXT: v_add_f32_e32 v0, v67, v0
- ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72
- ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11
- ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5
- ; GCN-NEXT: v_add_f32_e32 v0, v6, v0
- ; GCN-NEXT: v_exp_f32_e32 v11, v11
- ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8
- ; GCN-NEXT: v_exp_f32_e32 v12, v3
- ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13
- ; GCN-NEXT: v_exp_f32_e32 v17, v1
- ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14
- ; GCN-NEXT: v_add_f32_e32 v0, v7, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9
- ; GCN-NEXT: v_exp_f32_e32 v15, v3
- ; GCN-NEXT: v_exp_f32_e32 v18, v1
- ; GCN-NEXT: v_add_f32_e32 v6, v8, v0
- ; GCN-NEXT: ds_read_b128 v[0:3], v91
+ ; GCN-NEXT: ds_read_b128 v[24:27], v80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11
- ; GCN-NEXT: v_add_f32_e32 v6, v9, v6
- ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13
- ; GCN-NEXT: v_add_f32_e32 v6, v10, v6
- ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14
- ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18
- ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17
- ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12
- ; GCN-NEXT: v_add_f32_e32 v6, v11, v6
- ; GCN-NEXT: v_add_f32_e32 v6, v12, v6
- ; GCN-NEXT: v_add_f32_e32 v1, v15, v6
- ; GCN-NEXT: v_add_f32_e32 v11, v17, v1
- ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7
- ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10
- ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v41, v74
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v39
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v76
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v77, v56
+ ; GCN-NEXT: v_pack_b32_f16 v30, v38, v41
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v81
+ ; GCN-NEXT: v_pack_b32_f16 v31, v73, v77
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v41, v69
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v40
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[24:25], v[30:31], v[0:15]
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v24, v75
+ ; GCN-NEXT: v_pack_b32_f16 v25, v24, v38
+ ; GCN-NEXT: v_pack_b32_f16 v24, v73, v41
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[26:27], v[24:25], v[0:15]
+ ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[32:33] op_sel_hi:[1,0]
+ ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[52:53], v[34:35], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[54:55], v[36:37], v[0:15]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[48:49], v[16:17], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v62, v33
+ ; GCN-NEXT: v_add_f32_e32 v16, v63, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v68, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v44, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v58, v16
+ ; GCN-NEXT: v_add_f32_e32 v16, v46, v16
+ ; GCN-NEXT: v_add_f32_e32 v26, v71, v16
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[50:51], v[18:19], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v26, v59, v26
+ ; GCN-NEXT: v_add_f32_e32 v26, v45, v26
+ ; GCN-NEXT: v_add_f32_e32 v26, v70, v26
+ ; GCN-NEXT: v_add_f32_e32 v26, v47, v26
+ ; GCN-NEXT: v_add_f32_e32 v26, v57, v26
+ ; GCN-NEXT: ds_read_b128 v[16:19], v80 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mov_b32_e32 v4, 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v2, v18, v11
- ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[20:21], v[42:43], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v20, v60, v26
+ ; GCN-NEXT: v_add_f32_e32 v20, v64, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v65, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v61, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v66, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v67, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v72, v20
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[22:23], v[28:29], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v20, v39, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v74, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v76, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v56, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v40, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v69, v20
+ ; GCN-NEXT: v_add_f32_e32 v20, v75, v20
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[30:31], v[0:15]
+ ; GCN-NEXT: v_add_f32_e32 v16, v81, v20
+ ; GCN-NEXT: ds_bpermute_b32 v17, v79, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_add_f32_e32 v2, v2, v3
- ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
+ ; GCN-NEXT: v_add_f32_e32 v16, v16, v17
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[24:25], v[0:15]
+ ; GCN-NEXT: s_nop 10
+ ; GCN-NEXT: ds_bpermute_b32 v0, v79, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
- ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16
+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_endpgm
attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir
new file mode 100644
index 0000000000000..8721c32688571
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir
@@ -0,0 +1,118 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs -amdgpu-disable-mfma-chain-order-deps %s -o - 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=CHAIN
+
+---
+name: test_fmha_order
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $exec
+ ; CHECK-LABEL: name: test_fmha_order
+ ; CHECK: liveins: $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %addr:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: %accB:vreg_512_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: SCHED_BARRIER 0, implicit $exec
+ ; CHECK-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+ ; CHECK-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+ ; CHECK-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+ ; CHECK-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+ ; CHECK-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+ ; CHECK-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+ ; CHECK-NEXT: %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+ ; CHECK-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+ ; CHECK-NEXT: SCHED_BARRIER 1, implicit $exec
+ ; CHECK-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $exec
+ ;
+ ; CHAIN-LABEL: name: test_fmha_order
+ ; CHAIN: liveins: $exec
+ ; CHAIN-NEXT: {{ $}}
+ ; CHAIN-NEXT: %addr:vgpr_32 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %accB:vreg_512_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF
+ ; CHAIN-NEXT: SCHED_BARRIER 0, implicit $exec
+ ; CHAIN-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+ ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+ ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+ ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+ ; CHAIN-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+ ; CHAIN-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+ ; CHAIN-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+ ; CHAIN-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+ ; CHAIN-NEXT: SCHED_BARRIER 1, implicit $exec
+ ; CHAIN-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHAIN-NEXT: S_ENDPGM 0, implicit $exec
+ %addr:vgpr_32 = IMPLICIT_DEF
+ %c0:vreg_128_align2 = IMPLICIT_DEF
+ %c1:vreg_128_align2 = IMPLICIT_DEF
+ %accA:vreg_512_align2 = IMPLICIT_DEF
+ %accB:vreg_512_align2 = IMPLICIT_DEF
+ %accC:vreg_512_align2 = IMPLICIT_DEF
+ %accD:vreg_512_align2 = IMPLICIT_DEF
+
+ SCHED_BARRIER 0, implicit $exec
+
+ %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec
+ %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec
+ %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec
+ %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec
+ %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec
+ %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec
+ %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec
+ %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec
+
+ %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+
+ %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec
+
+ %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec
+ %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec
+
+ %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec
+ %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec
+
+ %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec
+ %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec
+
+ %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec
+ SCHED_BARRIER 1, implicit $exec
+ S_ENDPGM 0, implicit $exec
+
+...
+
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHAIN: {{.*}}
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index e29be2b744874..3f1b6826444a5 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,74 +369,70 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s5, s4
-; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5]
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[16:19]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], 0
; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
-; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
-; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
-; CHECK-NEXT: s_nop 3
-; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000
-; CHECK-NEXT: v_mov_b32_e32 v9, v8
-; CHECK-NEXT: v_mov_b32_e32 v10, v8
-; CHECK-NEXT: v_mov_b32_e32 v11, v8
-; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6
-; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT: global_store_short v[0:1], v2, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11]
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[20:21], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[22:23], v[0:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_mov_b32_e32 v12, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v13, v12
+; CHECK-NEXT: v_mov_b32_e32 v14, v12
+; CHECK-NEXT: v_mov_b32_e32 v15, v12
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[6:7], v[16:19]
+; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v10
+; CHECK-NEXT: global_store_short v[4:5], v9, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[12:15]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13]
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16
; CHECK-NEXT: s_nop 5
-; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
-; CHECK-NEXT: global_store_short v[0:1], v10, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
+; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v8
+; CHECK-NEXT: v_cvt_f16_f32_e32 v13, v10
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT: global_store_short v[4:5], v13, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11]
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CHECK-NEXT: global_store_short v[0:1], v6, off
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
+; CHECK-NEXT: global_store_short v[4:5], v14, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: global_store_short v[0:1], v24, off
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[22:23], v[6:7], v[8:11]
+; CHECK-NEXT: global_store_short v[4:5], v12, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[20:21], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[6:7], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[20:21], v[6:7], v[0:3]
; CHECK-NEXT: s_nop 6
-; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2
-; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
-; CHECK-NEXT: global_store_short v[0:1], v6, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8
+; CHECK-NEXT: global_store_short v[4:5], v0, off
; CHECK-NEXT: buffer_wbl2 sc0 sc1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_inv sc0 sc1
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT: global_store_short v[0:1], v2, off
+; CHECK-NEXT: global_store_short v[4:5], v1, off
; CHECK-NEXT: s_endpgm
entry:
%k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
More information about the llvm-commits
mailing list