[llvm] [LLVM][AMDGPU] extend IGLP (PR #135846)
Maksim Levental via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 13:17:22 PDT 2025
https://github.com/makslevental updated https://github.com/llvm/llvm-project/pull/135846
>From d1a8c56fee69a393e24f08602b624b2b233c44ab Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental at gmail.com>
Date: Tue, 15 Apr 2025 16:10:17 -0400
Subject: [PATCH] [LLVM][AMDGPU] extend IGLP
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 110 +-
llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn | 4361 +++++++++++++
llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir | 5722 +++++++++++++++++
.../AMDGPU/4_tlp_fast_no_barriers.llir | 4774 ++++++++++++++
.../AMDGPU/llvm.amdgcn.iglp.opt.max.ll | 139 +
5 files changed, 15076 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn
create mode 100644 llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir
create mode 100644 llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 153b14ce60507..80ee5dcbc0ef0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -94,9 +94,8 @@ class InstructionRule {
std::optional<SmallVector<SUnit *, 4>> Cache;
public:
- virtual bool
- apply(const SUnit *, const ArrayRef<SUnit *>,
- SmallVectorImpl<SchedGroup> &) {
+ virtual bool apply(const SUnit *, const ArrayRef<SUnit *>,
+ SmallVectorImpl<SchedGroup> &) {
return true;
};
@@ -696,6 +695,76 @@ bool PipelineSolver::solveExact() {
return FinishedExploring;
}
+// Implement an IGLP scheduling strategy.
+class IGLPStrategy {
+protected:
+ ScheduleDAGInstrs *DAG;
+
+ const SIInstrInfo *TII;
+
+public:
+ /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
+ virtual bool applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) = 0;
+
+ // Returns true if this strategy should be applied to a ScheduleDAG.
+ virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) = 0;
+
+ bool IsBottomUp = true;
+
+ IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : DAG(DAG), TII(TII) {}
+
+ virtual ~IGLPStrategy() = default;
+};
+
+class MaxsOpt final : public IGLPStrategy {
+private:
+public:
+ bool applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) override;
+
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+ AMDGPU::SchedulingPhase Phase) override {
+ return true;
+ }
+
+ MaxsOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = true;
+ }
+};
+
+bool MaxsOpt::applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ AMDGPU::SchedulingPhase Phase) {
+ // Count the number of MFMA instructions.
+ unsigned MFMACount = 0;
+ for (const MachineInstr &I : *DAG)
+ if (TII->isMFMAorWMMA(I))
+ ++MFMACount;
+
+ const unsigned PipelineSyncID = 0;
+ SchedGroup *SG = nullptr;
+ for (unsigned I = 0; I < MFMACount * 3; ++I) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ return true;
+}
+
template <typename T>
void PipelineSolver::greedyFind(
std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
@@ -815,33 +884,8 @@ enum IGLPStrategyID : int {
MFMASmallGemmOptID = 0,
MFMASmallGemmSingleWaveOptID = 1,
MFMAExpInterleaveID = 2,
- MFMAExpSimpleInterleaveID = 3
-};
-
-// Implement a IGLP scheduling strategy.
-class IGLPStrategy {
-protected:
- ScheduleDAGInstrs *DAG;
-
- const SIInstrInfo *TII;
-
-public:
- /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
- virtual bool applyIGLPStrategy(
- DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- AMDGPU::SchedulingPhase Phase) = 0;
-
- // Returns true if this strategy should be applied to a ScheduleDAG.
- virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
- AMDGPU::SchedulingPhase Phase) = 0;
-
- bool IsBottomUp = true;
-
- IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : DAG(DAG), TII(TII) {}
-
- virtual ~IGLPStrategy() = default;
+ MFMAExpSimpleInterleaveID = 3,
+ MaxsID = 4
};
class MFMASmallGemmOpt final : public IGLPStrategy {
@@ -2335,6 +2379,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
case MFMAExpSimpleInterleaveID:
return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+ case MaxsID:
+ return std::make_unique<MaxsOpt>(DAG, TII);
}
llvm_unreachable("Unknown IGLPStrategyID");
@@ -2599,10 +2645,14 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
}
if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
+ // llvm::dbgs() << "before pipeline solver\n";
+ // DAG->dump();
PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it
// determined as the best
PS.solve();
+ // llvm::dbgs() << "after pipeline solver\n";
+ // DAG->dump();
return;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn
new file mode 100644
index 0000000000000..9e8e5fabec7d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.amdgcn
@@ -0,0 +1,4361 @@
+ .text
+ .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
+ .amdhsa_code_object_version 4
+ .globl matmul_kernel ; -- Begin function matmul_kernel
+ .p2align 8
+ .type matmul_kernel, @function
+matmul_kernel: ; @matmul_kernel
+.Lfunc_begin0:
+ .cfi_sections .debug_frame
+ .cfi_startproc
+ s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+ .fill 63, 4, 0xbf800000 ; s_nop 0
+; %bb.0:
+ .file 1 "<unknown>"
+ .loc 1 0 0 prologue_end ; <unknown>:0:0
+ s_add_i32 s9, s11, 0xff
+ s_ashr_i32 s11, s9, 31
+ s_lshr_b32 s11, s11, 24
+ s_add_i32 s9, s9, s11
+ s_ashr_i32 s9, s9, 8
+ s_lshl_b32 s11, s9, 2
+ s_abs_i32 s16, s11
+ v_cvt_f32_u32_e32 v1, s16
+ s_ashr_i32 s8, s15, 31
+ s_lshr_b32 s8, s8, 29
+ s_add_i32 s8, s15, s8
+ v_rcp_iflag_f32_e32 v1, v1
+ s_ashr_i32 s8, s8, 3
+ s_sub_i32 s17, 0, s16
+ s_mulk_i32 s15, 0x4c
+ v_mul_f32_e32 v1, 0x4f7ffffe, v1
+ v_cvt_u32_f32_e32 v1, v1
+ s_mulk_i32 s8, 0xfda1
+ s_add_i32 s8, s8, s15
+ s_abs_i32 s15, s8
+ v_readfirstlane_b32 s18, v1
+ s_mul_i32 s17, s17, s18
+ s_mul_hi_u32 s17, s18, s17
+ s_add_i32 s18, s18, s17
+ s_mul_hi_u32 s17, s15, s18
+ s_mul_i32 s18, s17, s16
+ s_xor_b32 s9, s8, s9
+ s_sub_i32 s15, s15, s18
+ s_ashr_i32 s9, s9, 31
+ s_add_i32 s18, s17, 1
+ s_sub_i32 s19, s15, s16
+ s_cmp_ge_u32 s15, s16
+ s_cselect_b32 s17, s18, s17
+ s_cselect_b32 s15, s19, s15
+ s_add_i32 s18, s17, 1
+ s_cmp_ge_u32 s15, s16
+ s_cselect_b32 s15, s18, s17
+ s_addk_i32 s10, 0xff
+ s_ashr_i32 s16, s10, 31
+ s_xor_b32 s15, s15, s9
+ s_lshr_b32 s16, s16, 24
+ s_sub_i32 s9, s15, s9
+ s_add_i32 s10, s10, s16
+ s_lshl_b32 s15, s9, 2
+ s_ashr_i32 s10, s10, 8
+ s_sub_i32 s10, s10, s15
+ s_min_i32 s10, s10, 4
+ s_abs_i32 s16, s10
+ v_cvt_f32_u32_e32 v1, s16
+ s_sub_i32 s17, 0, s16
+ s_mul_i32 s9, s9, s11
+ s_sub_i32 s8, s8, s9
+ v_rcp_iflag_f32_e32 v1, v1
+ s_abs_i32 s11, s8
+ s_xor_b32 s9, s8, s10
+ s_ashr_i32 s9, s9, 31
+ v_mul_f32_e32 v1, 0x4f7ffffe, v1
+ v_cvt_u32_f32_e32 v1, v1
+ v_lshlrev_b32_e32 v130, 3, v0
+ v_and_b32_e32 v2, 56, v130
+ v_xor_b32_e32 v130, v130, v0
+ v_readfirstlane_b32 s18, v1
+ s_mul_i32 s17, s17, s18
+ s_mul_hi_u32 s17, s18, s17
+ s_add_i32 s18, s18, s17
+ s_mul_hi_u32 s17, s11, s18
+ s_mul_i32 s18, s17, s16
+ s_sub_i32 s11, s11, s18
+ s_add_i32 s18, s17, 1
+ s_sub_i32 s19, s11, s16
+ s_cmp_ge_u32 s11, s16
+ s_cselect_b32 s17, s18, s17
+ s_cselect_b32 s11, s19, s11
+ s_add_i32 s18, s17, 1
+ s_cmp_ge_u32 s11, s16
+ s_cselect_b32 s11, s18, s17
+ s_xor_b32 s11, s11, s9
+ s_sub_i32 s11, s11, s9
+ s_mul_i32 s9, s11, s10
+ s_sub_i32 s28, s8, s9
+ s_add_i32 s28, s28, s15
+ s_lshl_b32 s15, s28, 8
+ s_mul_i32 s8, s15, s13
+ s_ashr_i32 s9, s8, 31
+ s_lshl_b32 s10, s13, 5
+ s_lshl_b64 s[8:9], s[8:9], 1
+ s_add_u32 s16, s2, s8
+ s_addc_u32 s17, s3, s9
+ v_lshrrev_b32_e32 v1, 3, v0
+ s_add_u32 s20, s16, 0x80
+ v_mad_u64_u32 v[4:5], s[8:9], s13, v1, v[2:3]
+ s_addc_u32 s21, s17, 0
+ s_add_i32 s29, s12, 63
+ v_add_u32_e32 v3, s10, v4
+ s_cmp_gt_i32 s29, 63
+ v_lshlrev_b32_e32 v155, 1, v4
+ v_bfrev_b32_e32 v4, 1
+ s_cselect_b64 s[8:9], -1, 0
+ v_lshlrev_b32_e32 v156, 1, v3
+ s_and_b32 s17, s17, 0xffff
+ s_mov_b32 s19, 0x27000
+ s_mov_b32 s18, 0x7ffffffe
+ v_cndmask_b32_e64 v5, v4, v155, s[8:9]
+ v_add_u32_e32 v6, s10, v3
+ v_cndmask_b32_e64 v3, v4, v156, s[8:9]
+ buffer_load_dwordx4 v[66:69], v5, s[16:19], 0 offen
+ buffer_load_dwordx4 v[70:73], v3, s[16:19], 0 offen
+ v_add_u32_e32 v3, s10, v6
+ v_lshlrev_b32_e32 v157, 1, v6
+ v_cndmask_b32_e64 v5, v4, v157, s[8:9]
+ v_add_u32_e32 v6, s10, v3
+ v_lshlrev_b32_e32 v158, 1, v3
+ v_cndmask_b32_e64 v3, v4, v158, s[8:9]
+ buffer_load_dwordx4 v[74:77], v5, s[16:19], 0 offen
+ buffer_load_dwordx4 v[78:81], v3, s[16:19], 0 offen
+ v_add_u32_e32 v5, s10, v6
+ v_lshlrev_b32_e32 v159, 1, v6
+ v_add_u32_e32 v6, s10, v5
+ s_cmpk_gt_i32 s29, 0x7f
+ v_cndmask_b32_e64 v3, v4, v159, s[8:9]
+ v_lshlrev_b32_e32 v160, 1, v5
+ v_lshlrev_b32_e32 v161, 1, v6
+ s_cselect_b64 vcc, -1, 0
+ s_lshl_b32 s12, s11, 8
+ v_cndmask_b32_e64 v5, v4, v160, s[8:9]
+ buffer_load_dwordx4 v[82:85], v3, s[16:19], 0 offen
+ buffer_load_dwordx4 v[86:89], v5, s[16:19], 0 offen
+ v_cndmask_b32_e64 v3, v4, v161, s[8:9]
+ v_add_lshl_u32 v162, v6, s10, 1
+ s_mul_i32 s10, s12, s14
+ v_cndmask_b32_e64 v5, v4, v162, s[8:9]
+ buffer_load_dwordx4 v[90:93], v3, s[16:19], 0 offen
+ buffer_load_dwordx4 v[94:97], v5, s[16:19], 0 offen
+ s_lshl_b32 s26, s14, 5
+ s_ashr_i32 s11, s10, 31
+ v_mad_u64_u32 v[2:3], s[24:25], s14, v1, v[2:3]
+ s_and_b32 s21, s21, 0xffff
+ s_lshl_b64 s[10:11], s[10:11], 1
+ v_add_u32_e32 v3, s26, v2
+ s_add_u32 s16, s4, s10
+ v_add_u32_e32 v6, s26, v3
+ s_addc_u32 s17, s5, s11
+ v_add_u32_e32 v7, s26, v6
+ v_add_u32_e32 v8, s26, v7
+ s_add_u32 s24, s16, 0x80
+ v_lshlrev_b32_e32 v163, 1, v2
+ v_add_u32_e32 v9, s26, v8
+ s_addc_u32 s14, s17, 0
+ s_and_b32 s17, s17, 0xffff
+ v_cndmask_b32_e64 v2, v4, v163, s[8:9]
+ v_lshlrev_b32_e32 v164, 1, v3
+ v_lshlrev_b32_e32 v165, 1, v6
+ v_add_u32_e32 v10, s26, v9
+ v_cndmask_b32_e64 v3, v4, v164, s[8:9]
+ buffer_load_dwordx4 v[98:101], v2, s[16:19], 0 offen
+ buffer_load_dwordx4 v[102:105], v3, s[16:19], 0 offen
+ v_cndmask_b32_e64 v2, v4, v165, s[8:9]
+ v_lshlrev_b32_e32 v166, 1, v7
+ v_lshlrev_b32_e32 v167, 1, v8
+ v_cndmask_b32_e64 v3, v4, v166, s[8:9]
+ buffer_load_dwordx4 v[106:109], v2, s[16:19], 0 offen
+ buffer_load_dwordx4 v[110:113], v3, s[16:19], 0 offen
+ v_cndmask_b32_e64 v2, v4, v167, s[8:9]
+ v_lshlrev_b32_e32 v168, 1, v9
+ v_lshlrev_b32_e32 v169, 1, v10
+ v_cndmask_b32_e64 v3, v4, v168, s[8:9]
+ buffer_load_dwordx4 v[114:117], v2, s[16:19], 0 offen
+ buffer_load_dwordx4 v[118:121], v3, s[16:19], 0 offen
+ v_cndmask_b32_e64 v2, v4, v169, s[8:9]
+ v_add_lshl_u32 v170, v10, s26, 1
+ v_cndmask_b32_e64 v3, v4, v170, s[8:9]
+ buffer_load_dwordx4 v[122:125], v2, s[16:19], 0 offen
+ buffer_load_dwordx4 v[126:129], v3, s[16:19], 0 offen
+ s_mov_b32 s22, s18
+ s_mov_b32 s23, s19
+ v_cndmask_b32_e32 v5, v4, v155, vcc
+ v_cndmask_b32_e32 v2, v4, v156, vcc
+ buffer_load_dwordx4 v[54:57], v5, s[20:23], 0 offen
+ buffer_load_dwordx4 v[50:53], v2, s[20:23], 0 offen
+ v_cndmask_b32_e32 v2, v4, v157, vcc
+ v_cndmask_b32_e32 v3, v4, v158, vcc
+ buffer_load_dwordx4 v[46:49], v2, s[20:23], 0 offen
+ buffer_load_dwordx4 v[58:61], v3, s[20:23], 0 offen
+ v_cndmask_b32_e32 v2, v4, v159, vcc
+ v_cndmask_b32_e32 v3, v4, v160, vcc
+ buffer_load_dwordx4 v[38:41], v2, s[20:23], 0 offen
+ buffer_load_dwordx4 v[62:65], v3, s[20:23], 0 offen
+ v_cndmask_b32_e32 v2, v4, v161, vcc
+ v_cndmask_b32_e32 v3, v4, v162, vcc
+ buffer_load_dwordx4 v[34:37], v2, s[20:23], 0 offen
+ buffer_load_dwordx4 v[26:29], v3, s[20:23], 0 offen
+ s_and_b32 s25, s14, 0xffff
+ s_mov_b32 s26, s18
+ s_mov_b32 s27, s19
+ v_cndmask_b32_e32 v2, v4, v163, vcc
+ v_cndmask_b32_e32 v3, v4, v164, vcc
+ buffer_load_dwordx4 v[30:33], v2, s[24:27], 0 offen
+ buffer_load_dwordx4 v[22:25], v3, s[24:27], 0 offen
+ v_cndmask_b32_e32 v2, v4, v165, vcc
+ v_cndmask_b32_e32 v3, v4, v166, vcc
+ buffer_load_dwordx4 v[18:21], v2, s[24:27], 0 offen
+ buffer_load_dwordx4 v[14:17], v3, s[24:27], 0 offen
+ v_cndmask_b32_e32 v2, v4, v167, vcc
+ v_cndmask_b32_e32 v3, v4, v168, vcc
+ buffer_load_dwordx4 v[10:13], v2, s[24:27], 0 offen
+ buffer_load_dwordx4 v[6:9], v3, s[24:27], 0 offen
+ v_cndmask_b32_e32 v2, v4, v169, vcc
+ v_cndmask_b32_e32 v42, v4, v170, vcc
+ buffer_load_dwordx4 v[2:5], v2, s[24:27], 0 offen
+ s_nop 0
+ buffer_load_dwordx4 v[42:45], v42, s[24:27], 0 offen
+ v_and_b32_e32 v130, 56, v130
+ v_lshlrev_b32_e32 v130, 1, v130
+ v_lshl_or_b32 v130, v1, 7, v130
+ s_add_i32 s14, 0, 0x8000
+ v_add_u32_e32 v132, 0, v130
+ v_add_u32_e32 v131, s14, v130
+ s_waitcnt vmcnt(31)
+ ds_write_b128 v132, v[66:69]
+ s_waitcnt vmcnt(30)
+ ds_write_b128 v132, v[70:73] offset:4096
+ s_waitcnt vmcnt(29)
+ ds_write_b128 v132, v[74:77] offset:8192
+ s_waitcnt vmcnt(28)
+ ds_write_b128 v132, v[78:81] offset:12288
+ s_waitcnt vmcnt(27)
+ ds_write_b128 v132, v[82:85] offset:16384
+ s_waitcnt vmcnt(26)
+ ds_write_b128 v132, v[86:89] offset:20480
+ s_waitcnt vmcnt(25)
+ ds_write_b128 v132, v[90:93] offset:24576
+ s_waitcnt vmcnt(24)
+ ds_write_b128 v132, v[94:97] offset:28672
+ s_waitcnt vmcnt(23)
+ ds_write_b128 v132, v[98:101] offset:32768
+ s_waitcnt vmcnt(22)
+ ds_write_b128 v131, v[102:105] offset:4096
+ s_waitcnt vmcnt(21)
+ ds_write_b128 v131, v[106:109] offset:8192
+ s_waitcnt vmcnt(20)
+ ds_write_b128 v131, v[110:113] offset:12288
+ s_waitcnt vmcnt(19)
+ ds_write_b128 v131, v[114:117] offset:16384
+ s_waitcnt vmcnt(18)
+ ds_write_b128 v131, v[118:121] offset:20480
+ s_waitcnt vmcnt(17)
+ ds_write_b128 v131, v[122:125] offset:24576
+ s_waitcnt vmcnt(16)
+ ds_write_b128 v131, v[126:129] offset:28672
+ v_and_b32_e32 v66, 15, v0
+ v_bfe_u32 v106, v0, 4, 2
+ v_and_b32_e32 v99, 7, v0
+ v_lshrrev_b32_e32 v152, 2, v0
+ v_and_or_b32 v130, v1, 16, v66
+ v_xor_b32_e32 v1, v106, v99
+ v_and_or_b32 v0, v152, 16, v66
+ v_lshlrev_b32_e32 v102, 3, v1
+ v_lshlrev_b32_e32 v101, 6, v0
+ v_lshlrev_b32_e32 v98, 6, v130
+ v_or_b32_e32 v0, v102, v101
+ v_or_b32_e32 v1, v98, v102
+ v_lshlrev_b32_e32 v0, 1, v0
+ v_lshl_add_u32 v1, v1, 1, 0
+ v_add_u32_e32 v133, 0, v0
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ ds_read_b128 v[94:97], v1
+ ds_read_b128 v[90:93], v1 offset:4096
+ v_add_u32_e32 v0, s14, v0
+ ds_read_b128 v[78:81], v133 offset:32768
+ ds_read_b128 v[74:77], v0 offset:4096
+ ds_read_b128 v[86:89], v1 offset:8192
+ ds_read_b128 v[82:85], v1 offset:12288
+ ds_read_b128 v[70:73], v0 offset:8192
+ ds_read_b128 v[66:69], v0 offset:12288
+ s_mov_b32 s16, 0
+ v_or_b32_e32 v100, 0x800, v98
+ v_or_b32_e32 v103, 0x1000, v98
+ v_or_b32_e32 v104, 0x1800, v98
+ v_or_b32_e32 v105, 0x1000, v101
+ s_cmpk_gt_i32 s29, 0xbf
+ v_or_b32_e32 v106, 4, v106
+ s_cbranch_scc1 .LBB0_3
+; %bb.1: ; %.._crit_edge_crit_edge
+ v_xor_b32_e32 v113, v106, v99
+ v_or_b32_e32 v107, 0x2000, v101
+ v_or_b32_e32 v108, 0x3000, v101
+ v_lshlrev_b32_e32 v113, 3, v113
+ v_or_b32_e32 v109, 0x2000, v98
+ v_or_b32_e32 v110, 0x2800, v98
+ v_or_b32_e32 v111, 0x3000, v98
+ v_or_b32_e32 v112, 0x3800, v98
+ v_or_b32_e32 v140, v113, v107
+ v_or_b32_e32 v138, v113, v108
+ v_or_b32_e32 v154, v107, v102
+ v_or_b32_e32 v153, v108, v102
+ v_or_b32_e32 v151, v109, v102
+ v_or_b32_e32 v150, v110, v102
+ v_or_b32_e32 v149, v111, v102
+ v_or_b32_e32 v148, v112, v102
+ v_or_b32_e32 v147, v113, v101
+ v_or_b32_e32 v145, v113, v98
+ v_or_b32_e32 v146, v100, v113
+ v_or_b32_e32 v144, v113, v105
+ v_or_b32_e32 v142, v103, v113
+ v_or_b32_e32 v143, v104, v113
+ v_or_b32_e32 v141, 0x800, v140
+ v_or_b32_e32 v139, 0x800, v138
+ v_or_b32_e32 v136, v109, v113
+ v_or_b32_e32 v137, v110, v113
+ v_or_b32_e32 v134, v111, v113
+ v_or_b32_e32 v135, v112, v113
+ s_cbranch_execz .LBB0_4
+; %bb.2:
+ v_accvgpr_write_b32 a0, s16
+ v_accvgpr_write_b32 a1, s16
+ v_accvgpr_write_b32 a2, s16
+ v_accvgpr_write_b32 a3, s16
+ v_accvgpr_write_b32 a4, s16
+ v_accvgpr_write_b32 a5, s16
+ v_accvgpr_write_b32 a6, s16
+ v_accvgpr_write_b32 a7, s16
+ v_accvgpr_write_b32 a8, s16
+ v_accvgpr_write_b32 a9, s16
+ v_accvgpr_write_b32 a10, s16
+ v_accvgpr_write_b32 a11, s16
+ v_accvgpr_write_b32 a12, s16
+ v_accvgpr_write_b32 a13, s16
+ v_accvgpr_write_b32 a14, s16
+ v_accvgpr_write_b32 a15, s16
+ v_accvgpr_write_b32 a16, s16
+ v_accvgpr_write_b32 a17, s16
+ v_accvgpr_write_b32 a18, s16
+ v_accvgpr_write_b32 a19, s16
+ v_accvgpr_write_b32 a20, s16
+ v_accvgpr_write_b32 a21, s16
+ v_accvgpr_write_b32 a22, s16
+ v_accvgpr_write_b32 a23, s16
+ v_accvgpr_write_b32 a24, s16
+ v_accvgpr_write_b32 a25, s16
+ v_accvgpr_write_b32 a26, s16
+ v_accvgpr_write_b32 a27, s16
+ v_accvgpr_write_b32 a28, s16
+ v_accvgpr_write_b32 a29, s16
+ v_accvgpr_write_b32 a30, s16
+ v_accvgpr_write_b32 a31, s16
+ v_accvgpr_write_b32 a64, s16
+ v_accvgpr_write_b32 a65, s16
+ v_accvgpr_write_b32 a66, s16
+ v_accvgpr_write_b32 a67, s16
+ v_accvgpr_write_b32 a68, s16
+ v_accvgpr_write_b32 a69, s16
+ v_accvgpr_write_b32 a70, s16
+ v_accvgpr_write_b32 a71, s16
+ v_accvgpr_write_b32 a72, s16
+ v_accvgpr_write_b32 a73, s16
+ v_accvgpr_write_b32 a74, s16
+ v_accvgpr_write_b32 a75, s16
+ v_accvgpr_write_b32 a76, s16
+ v_accvgpr_write_b32 a77, s16
+ v_accvgpr_write_b32 a78, s16
+ v_accvgpr_write_b32 a79, s16
+ v_accvgpr_write_b32 a80, s16
+ v_accvgpr_write_b32 a81, s16
+ v_accvgpr_write_b32 a82, s16
+ v_accvgpr_write_b32 a83, s16
+ v_accvgpr_write_b32 a84, s16
+ v_accvgpr_write_b32 a85, s16
+ v_accvgpr_write_b32 a86, s16
+ v_accvgpr_write_b32 a87, s16
+ v_accvgpr_write_b32 a88, s16
+ v_accvgpr_write_b32 a89, s16
+ v_accvgpr_write_b32 a90, s16
+ v_accvgpr_write_b32 a91, s16
+ v_accvgpr_write_b32 a92, s16
+ v_accvgpr_write_b32 a93, s16
+ v_accvgpr_write_b32 a94, s16
+ v_accvgpr_write_b32 a95, s16
+ v_accvgpr_write_b32 a32, s16
+ v_accvgpr_write_b32 a33, s16
+ v_accvgpr_write_b32 a34, s16
+ v_accvgpr_write_b32 a35, s16
+ v_accvgpr_write_b32 a36, s16
+ v_accvgpr_write_b32 a37, s16
+ v_accvgpr_write_b32 a38, s16
+ v_accvgpr_write_b32 a39, s16
+ v_accvgpr_write_b32 a40, s16
+ v_accvgpr_write_b32 a41, s16
+ v_accvgpr_write_b32 a42, s16
+ v_accvgpr_write_b32 a43, s16
+ v_accvgpr_write_b32 a44, s16
+ v_accvgpr_write_b32 a45, s16
+ v_accvgpr_write_b32 a46, s16
+ v_accvgpr_write_b32 a47, s16
+ v_accvgpr_write_b32 a48, s16
+ v_accvgpr_write_b32 a49, s16
+ v_accvgpr_write_b32 a50, s16
+ v_accvgpr_write_b32 a51, s16
+ v_accvgpr_write_b32 a52, s16
+ v_accvgpr_write_b32 a53, s16
+ v_accvgpr_write_b32 a54, s16
+ v_accvgpr_write_b32 a55, s16
+ v_accvgpr_write_b32 a56, s16
+ v_accvgpr_write_b32 a57, s16
+ v_accvgpr_write_b32 a58, s16
+ v_accvgpr_write_b32 a59, s16
+ v_accvgpr_write_b32 a60, s16
+ v_accvgpr_write_b32 a61, s16
+ v_accvgpr_write_b32 a62, s16
+ v_accvgpr_write_b32 a63, s16
+ v_accvgpr_write_b32 a96, s16
+ v_accvgpr_write_b32 a97, s16
+ v_accvgpr_write_b32 a98, s16
+ v_accvgpr_write_b32 a99, s16
+ v_accvgpr_write_b32 a100, s16
+ v_accvgpr_write_b32 a101, s16
+ v_accvgpr_write_b32 a102, s16
+ v_accvgpr_write_b32 a103, s16
+ v_accvgpr_write_b32 a104, s16
+ v_accvgpr_write_b32 a105, s16
+ v_accvgpr_write_b32 a106, s16
+ v_accvgpr_write_b32 a107, s16
+ v_accvgpr_write_b32 a108, s16
+ v_accvgpr_write_b32 a109, s16
+ v_accvgpr_write_b32 a110, s16
+ v_accvgpr_write_b32 a111, s16
+ v_accvgpr_write_b32 a112, s16
+ v_accvgpr_write_b32 a113, s16
+ v_accvgpr_write_b32 a114, s16
+ v_accvgpr_write_b32 a115, s16
+ v_accvgpr_write_b32 a116, s16
+ v_accvgpr_write_b32 a117, s16
+ v_accvgpr_write_b32 a118, s16
+ v_accvgpr_write_b32 a119, s16
+ v_accvgpr_write_b32 a120, s16
+ v_accvgpr_write_b32 a121, s16
+ v_accvgpr_write_b32 a122, s16
+ v_accvgpr_write_b32 a123, s16
+ v_accvgpr_write_b32 a124, s16
+ v_accvgpr_write_b32 a125, s16
+ v_accvgpr_write_b32 a126, s16
+ v_accvgpr_write_b32 a127, s16
+ v_accvgpr_write_b32 a132, s16
+ v_accvgpr_write_b32 a133, s16
+ v_accvgpr_write_b32 a134, s16
+ v_accvgpr_write_b32 a135, s16
+ v_accvgpr_write_b32 a136, s16
+ v_accvgpr_write_b32 a137, s16
+ v_accvgpr_write_b32 a138, s16
+ v_accvgpr_write_b32 a139, s16
+ v_accvgpr_write_b32 a140, s16
+ v_accvgpr_write_b32 a141, s16
+ v_accvgpr_write_b32 a142, s16
+ v_accvgpr_write_b32 a143, s16
+ v_accvgpr_write_b32 a144, s16
+ v_accvgpr_write_b32 a145, s16
+ v_accvgpr_write_b32 a146, s16
+ v_accvgpr_write_b32 a147, s16
+ v_accvgpr_write_b32 a148, s16
+ v_accvgpr_write_b32 a149, s16
+ v_accvgpr_write_b32 a150, s16
+ v_accvgpr_write_b32 a151, s16
+ v_accvgpr_write_b32 a152, s16
+ v_accvgpr_write_b32 a153, s16
+ v_accvgpr_write_b32 a154, s16
+ v_accvgpr_write_b32 a155, s16
+ v_accvgpr_write_b32 a156, s16
+ v_accvgpr_write_b32 a157, s16
+ v_accvgpr_write_b32 a158, s16
+ v_accvgpr_write_b32 a159, s16
+ v_accvgpr_write_b32 a160, s16
+ v_accvgpr_write_b32 a161, s16
+ v_accvgpr_write_b32 a162, s16
+ v_accvgpr_write_b32 a163, s16
+ v_accvgpr_write_b32 a220, s16
+ v_accvgpr_write_b32 a221, s16
+ v_accvgpr_write_b32 a222, s16
+ v_accvgpr_write_b32 a223, s16
+ v_accvgpr_write_b32 a224, s16
+ v_accvgpr_write_b32 a225, s16
+ v_accvgpr_write_b32 a226, s16
+ v_accvgpr_write_b32 a227, s16
+ v_accvgpr_write_b32 a232, s16
+ v_accvgpr_write_b32 a233, s16
+ v_accvgpr_write_b32 a234, s16
+ v_accvgpr_write_b32 a235, s16
+ v_accvgpr_write_b32 a236, s16
+ v_accvgpr_write_b32 a237, s16
+ v_accvgpr_write_b32 a238, s16
+ v_accvgpr_write_b32 a239, s16
+ v_accvgpr_write_b32 a240, s16
+ v_accvgpr_write_b32 a241, s16
+ v_accvgpr_write_b32 a242, s16
+ v_accvgpr_write_b32 a243, s16
+ v_accvgpr_write_b32 a244, s16
+ v_accvgpr_write_b32 a245, s16
+ v_accvgpr_write_b32 a246, s16
+ v_accvgpr_write_b32 a247, s16
+ v_accvgpr_write_b32 a248, s16
+ v_accvgpr_write_b32 a249, s16
+ v_accvgpr_write_b32 a250, s16
+ v_accvgpr_write_b32 a251, s16
+ v_accvgpr_write_b32 a252, s16
+ v_accvgpr_write_b32 a253, s16
+ v_accvgpr_write_b32 a254, s16
+ v_accvgpr_write_b32 a255, s16
+ v_accvgpr_write_b32 a164, s16
+ v_accvgpr_write_b32 a165, s16
+ v_accvgpr_write_b32 a166, s16
+ v_accvgpr_write_b32 a167, s16
+ v_accvgpr_write_b32 a168, s16
+ v_accvgpr_write_b32 a169, s16
+ v_accvgpr_write_b32 a170, s16
+ v_accvgpr_write_b32 a171, s16
+ v_accvgpr_write_b32 a172, s16
+ v_accvgpr_write_b32 a173, s16
+ v_accvgpr_write_b32 a174, s16
+ v_accvgpr_write_b32 a175, s16
+ v_accvgpr_write_b32 a176, s16
+ v_accvgpr_write_b32 a177, s16
+ v_accvgpr_write_b32 a178, s16
+ v_accvgpr_write_b32 a179, s16
+ v_accvgpr_write_b32 a128, s16
+ v_accvgpr_write_b32 a129, s16
+ v_accvgpr_write_b32 a130, s16
+ v_accvgpr_write_b32 a131, s16
+ v_accvgpr_write_b32 a180, s16
+ v_accvgpr_write_b32 a181, s16
+ v_accvgpr_write_b32 a182, s16
+ v_accvgpr_write_b32 a183, s16
+ v_accvgpr_write_b32 a184, s16
+ v_accvgpr_write_b32 a185, s16
+ v_accvgpr_write_b32 a186, s16
+ v_accvgpr_write_b32 a187, s16
+ v_accvgpr_write_b32 a188, s16
+ v_accvgpr_write_b32 a189, s16
+ v_accvgpr_write_b32 a190, s16
+ v_accvgpr_write_b32 a191, s16
+ v_accvgpr_write_b32 a192, s16
+ v_accvgpr_write_b32 a193, s16
+ v_accvgpr_write_b32 a194, s16
+ v_accvgpr_write_b32 a195, s16
+ v_accvgpr_write_b32 a204, s16
+ v_accvgpr_write_b32 a205, s16
+ v_accvgpr_write_b32 a206, s16
+ v_accvgpr_write_b32 a207, s16
+ v_accvgpr_write_b32 a216, s16
+ v_accvgpr_write_b32 a217, s16
+ v_accvgpr_write_b32 a218, s16
+ v_accvgpr_write_b32 a219, s16
+ v_accvgpr_write_b32 a228, s16
+ v_accvgpr_write_b32 a229, s16
+ v_accvgpr_write_b32 a230, s16
+ v_accvgpr_write_b32 a231, s16
+ v_accvgpr_write_b32 a196, s16
+ v_accvgpr_write_b32 a197, s16
+ v_accvgpr_write_b32 a198, s16
+ v_accvgpr_write_b32 a199, s16
+ v_accvgpr_write_b32 a200, s16
+ v_accvgpr_write_b32 a201, s16
+ v_accvgpr_write_b32 a202, s16
+ v_accvgpr_write_b32 a203, s16
+ v_accvgpr_write_b32 a208, s16
+ v_accvgpr_write_b32 a209, s16
+ v_accvgpr_write_b32 a210, s16
+ v_accvgpr_write_b32 a211, s16
+ v_accvgpr_write_b32 a212, s16
+ v_accvgpr_write_b32 a213, s16
+ v_accvgpr_write_b32 a214, s16
+ v_accvgpr_write_b32 a215, s16
+ s_branch .LBB0_6
+.LBB0_3:
+ ; implicit-def: $sgpr16
+ ; implicit-def: $vgpr154
+ ; implicit-def: $vgpr153
+ ; implicit-def: $vgpr151
+ ; implicit-def: $vgpr150
+ ; implicit-def: $vgpr149
+ ; implicit-def: $vgpr148
+ ; implicit-def: $vgpr147
+ ; implicit-def: $vgpr145
+ ; implicit-def: $vgpr146
+ ; implicit-def: $vgpr144
+ ; implicit-def: $vgpr142
+ ; implicit-def: $vgpr143
+ ; implicit-def: $vgpr140
+ ; implicit-def: $vgpr141
+ ; implicit-def: $vgpr138
+ ; implicit-def: $vgpr139
+ ; implicit-def: $vgpr136
+ ; implicit-def: $vgpr137
+ ; implicit-def: $vgpr134
+ ; implicit-def: $vgpr135
+.LBB0_4: ; %.lr.ph
+ s_lshr_b32 s16, s29, 6
+ s_add_u32 s4, s4, s10
+ s_addc_u32 s5, s5, s11
+ s_mul_i32 s13, s13, s28
+ v_add_u32_e32 v108, v101, v102
+ v_xor_b32_e32 v99, v106, v99
+ v_lshl_add_u32 v108, v108, 1, s14
+ v_lshlrev_b32_e32 v99, 3, v99
+ v_or_b32_e32 v107, 0x2000, v101
+ v_add_u32_e32 v171, 0x1000, v108
+ v_or_b32_e32 v108, 0x3000, v101
+ v_or_b32_e32 v109, 0x2000, v98
+ v_or_b32_e32 v110, 0x2800, v98
+ v_or_b32_e32 v111, 0x3000, v98
+ v_or_b32_e32 v112, 0x3800, v98
+ v_or_b32_e32 v140, v99, v107
+ v_or_b32_e32 v138, v99, v108
+ v_or_b32_e32 v143, v104, v99
+ s_add_u32 s4, s4, 0x100
+ s_addc_u32 s5, s5, 0
+ s_lshl_b32 s8, s13, 8
+ s_ashr_i32 s9, s8, 31
+ s_lshl_b64 s[8:9], s[8:9], 1
+ s_add_u32 s2, s8, s2
+ s_addc_u32 s3, s9, s3
+ s_add_u32 s2, s2, 0x100
+ v_or_b32_e32 v141, 0x800, v140
+ v_or_b32_e32 v148, v112, v102
+ v_or_b32_e32 v149, v111, v102
+ v_or_b32_e32 v150, v110, v102
+ v_or_b32_e32 v151, v109, v102
+ v_or_b32_e32 v153, v108, v102
+ v_or_b32_e32 v154, v107, v102
+ v_or_b32_e32 v145, v99, v98
+ v_lshl_add_u32 v172, v145, 1, 0
+ v_or_b32_e32 v146, v100, v99
+ v_add_u32_e32 v98, v98, v99
+ v_lshl_add_u32 v173, v98, 1, 0
+ v_add_u32_e32 v98, v99, v101
+ v_lshl_add_u32 v175, v98, 1, s14
+ v_or_b32_e32 v147, v99, v101
+ v_lshl_add_u32 v174, v147, 1, 0
+ v_or_b32_e32 v144, v99, v105
+ v_or_b32_e32 v142, v103, v99
+ v_or_b32_e32 v139, 0x800, v138
+ v_or_b32_e32 v136, v109, v99
+ v_or_b32_e32 v137, v110, v99
+ v_or_b32_e32 v134, v111, v99
+ v_or_b32_e32 v135, v112, v99
+ s_addc_u32 s3, s3, 0
+ s_add_i32 s13, s16, -2
+ v_accvgpr_write_b32 a15, 0
+ v_accvgpr_write_b32 a14, 0
+ v_accvgpr_write_b32 a13, 0
+ v_accvgpr_write_b32 a12, 0
+ v_accvgpr_write_b32 a11, 0
+ v_accvgpr_write_b32 a10, 0
+ v_accvgpr_write_b32 a9, 0
+ v_accvgpr_write_b32 a8, 0
+ v_accvgpr_write_b32 a7, 0
+ v_accvgpr_write_b32 a6, 0
+ v_accvgpr_write_b32 a5, 0
+ v_accvgpr_write_b32 a4, 0
+ v_accvgpr_write_b32 a3, 0
+ v_accvgpr_write_b32 a2, 0
+ v_accvgpr_write_b32 a1, 0
+ v_accvgpr_write_b32 a0, 0
+ v_accvgpr_write_b32 a31, 0
+ v_accvgpr_write_b32 a30, 0
+ v_accvgpr_write_b32 a29, 0
+ v_accvgpr_write_b32 a28, 0
+ v_accvgpr_write_b32 a27, 0
+ v_accvgpr_write_b32 a26, 0
+ v_accvgpr_write_b32 a25, 0
+ v_accvgpr_write_b32 a24, 0
+ v_accvgpr_write_b32 a23, 0
+ v_accvgpr_write_b32 a22, 0
+ v_accvgpr_write_b32 a21, 0
+ v_accvgpr_write_b32 a20, 0
+ v_accvgpr_write_b32 a19, 0
+ v_accvgpr_write_b32 a18, 0
+ v_accvgpr_write_b32 a17, 0
+ v_accvgpr_write_b32 a16, 0
+ v_accvgpr_write_b32 a79, 0
+ v_accvgpr_write_b32 a78, 0
+ v_accvgpr_write_b32 a77, 0
+ v_accvgpr_write_b32 a76, 0
+ v_accvgpr_write_b32 a75, 0
+ v_accvgpr_write_b32 a74, 0
+ v_accvgpr_write_b32 a73, 0
+ v_accvgpr_write_b32 a72, 0
+ v_accvgpr_write_b32 a71, 0
+ v_accvgpr_write_b32 a70, 0
+ v_accvgpr_write_b32 a69, 0
+ v_accvgpr_write_b32 a68, 0
+ v_accvgpr_write_b32 a67, 0
+ v_accvgpr_write_b32 a66, 0
+ v_accvgpr_write_b32 a65, 0
+ v_accvgpr_write_b32 a64, 0
+ v_accvgpr_write_b32 a95, 0
+ v_accvgpr_write_b32 a94, 0
+ v_accvgpr_write_b32 a93, 0
+ v_accvgpr_write_b32 a92, 0
+ v_accvgpr_write_b32 a91, 0
+ v_accvgpr_write_b32 a90, 0
+ v_accvgpr_write_b32 a89, 0
+ v_accvgpr_write_b32 a88, 0
+ v_accvgpr_write_b32 a87, 0
+ v_accvgpr_write_b32 a86, 0
+ v_accvgpr_write_b32 a85, 0
+ v_accvgpr_write_b32 a84, 0
+ v_accvgpr_write_b32 a83, 0
+ v_accvgpr_write_b32 a82, 0
+ v_accvgpr_write_b32 a81, 0
+ v_accvgpr_write_b32 a80, 0
+ v_accvgpr_write_b32 a47, 0
+ v_accvgpr_write_b32 a46, 0
+ v_accvgpr_write_b32 a45, 0
+ v_accvgpr_write_b32 a44, 0
+ v_accvgpr_write_b32 a43, 0
+ v_accvgpr_write_b32 a42, 0
+ v_accvgpr_write_b32 a41, 0
+ v_accvgpr_write_b32 a40, 0
+ v_accvgpr_write_b32 a39, 0
+ v_accvgpr_write_b32 a38, 0
+ v_accvgpr_write_b32 a37, 0
+ v_accvgpr_write_b32 a36, 0
+ v_accvgpr_write_b32 a35, 0
+ v_accvgpr_write_b32 a34, 0
+ v_accvgpr_write_b32 a33, 0
+ v_accvgpr_write_b32 a32, 0
+ v_accvgpr_write_b32 a63, 0
+ v_accvgpr_write_b32 a62, 0
+ v_accvgpr_write_b32 a61, 0
+ v_accvgpr_write_b32 a60, 0
+ v_accvgpr_write_b32 a59, 0
+ v_accvgpr_write_b32 a58, 0
+ v_accvgpr_write_b32 a57, 0
+ v_accvgpr_write_b32 a56, 0
+ v_accvgpr_write_b32 a55, 0
+ v_accvgpr_write_b32 a54, 0
+ v_accvgpr_write_b32 a53, 0
+ v_accvgpr_write_b32 a52, 0
+ v_accvgpr_write_b32 a51, 0
+ v_accvgpr_write_b32 a50, 0
+ v_accvgpr_write_b32 a49, 0
+ v_accvgpr_write_b32 a48, 0
+ v_accvgpr_write_b32 a111, 0
+ v_accvgpr_write_b32 a110, 0
+ v_accvgpr_write_b32 a109, 0
+ v_accvgpr_write_b32 a108, 0
+ v_accvgpr_write_b32 a107, 0
+ v_accvgpr_write_b32 a106, 0
+ v_accvgpr_write_b32 a105, 0
+ v_accvgpr_write_b32 a104, 0
+ v_accvgpr_write_b32 a103, 0
+ v_accvgpr_write_b32 a102, 0
+ v_accvgpr_write_b32 a101, 0
+ v_accvgpr_write_b32 a100, 0
+ v_accvgpr_write_b32 a99, 0
+ v_accvgpr_write_b32 a98, 0
+ v_accvgpr_write_b32 a97, 0
+ v_accvgpr_write_b32 a96, 0
+ v_accvgpr_write_b32 a127, 0
+ v_accvgpr_write_b32 a126, 0
+ v_accvgpr_write_b32 a125, 0
+ v_accvgpr_write_b32 a124, 0
+ v_accvgpr_write_b32 a123, 0
+ v_accvgpr_write_b32 a122, 0
+ v_accvgpr_write_b32 a121, 0
+ v_accvgpr_write_b32 a120, 0
+ v_accvgpr_write_b32 a119, 0
+ v_accvgpr_write_b32 a118, 0
+ v_accvgpr_write_b32 a117, 0
+ v_accvgpr_write_b32 a116, 0
+ v_accvgpr_write_b32 a115, 0
+ v_accvgpr_write_b32 a114, 0
+ v_accvgpr_write_b32 a113, 0
+ v_accvgpr_write_b32 a112, 0
+ v_accvgpr_write_b32 a147, 0
+ v_accvgpr_write_b32 a146, 0
+ v_accvgpr_write_b32 a145, 0
+ v_accvgpr_write_b32 a144, 0
+ v_accvgpr_write_b32 a143, 0
+ v_accvgpr_write_b32 a142, 0
+ v_accvgpr_write_b32 a141, 0
+ v_accvgpr_write_b32 a140, 0
+ v_accvgpr_write_b32 a139, 0
+ v_accvgpr_write_b32 a138, 0
+ v_accvgpr_write_b32 a137, 0
+ v_accvgpr_write_b32 a136, 0
+ v_accvgpr_write_b32 a135, 0
+ v_accvgpr_write_b32 a134, 0
+ v_accvgpr_write_b32 a133, 0
+ v_accvgpr_write_b32 a132, 0
+ v_accvgpr_write_b32 a163, 0
+ v_accvgpr_write_b32 a162, 0
+ v_accvgpr_write_b32 a161, 0
+ v_accvgpr_write_b32 a160, 0
+ v_accvgpr_write_b32 a159, 0
+ v_accvgpr_write_b32 a158, 0
+ v_accvgpr_write_b32 a157, 0
+ v_accvgpr_write_b32 a156, 0
+ v_accvgpr_write_b32 a155, 0
+ v_accvgpr_write_b32 a154, 0
+ v_accvgpr_write_b32 a153, 0
+ v_accvgpr_write_b32 a152, 0
+ v_accvgpr_write_b32 a151, 0
+ v_accvgpr_write_b32 a150, 0
+ v_accvgpr_write_b32 a149, 0
+ v_accvgpr_write_b32 a148, 0
+ v_accvgpr_write_b32 a239, 0
+ v_accvgpr_write_b32 a238, 0
+ v_accvgpr_write_b32 a237, 0
+ v_accvgpr_write_b32 a236, 0
+ v_accvgpr_write_b32 a235, 0
+ v_accvgpr_write_b32 a234, 0
+ v_accvgpr_write_b32 a233, 0
+ v_accvgpr_write_b32 a232, 0
+ v_accvgpr_write_b32 a227, 0
+ v_accvgpr_write_b32 a226, 0
+ v_accvgpr_write_b32 a225, 0
+ v_accvgpr_write_b32 a224, 0
+ v_accvgpr_write_b32 a223, 0
+ v_accvgpr_write_b32 a222, 0
+ v_accvgpr_write_b32 a221, 0
+ v_accvgpr_write_b32 a220, 0
+ v_accvgpr_write_b32 a255, 0
+ v_accvgpr_write_b32 a254, 0
+ v_accvgpr_write_b32 a253, 0
+ v_accvgpr_write_b32 a252, 0
+ v_accvgpr_write_b32 a251, 0
+ v_accvgpr_write_b32 a250, 0
+ v_accvgpr_write_b32 a249, 0
+ v_accvgpr_write_b32 a248, 0
+ v_accvgpr_write_b32 a247, 0
+ v_accvgpr_write_b32 a246, 0
+ v_accvgpr_write_b32 a245, 0
+ v_accvgpr_write_b32 a244, 0
+ v_accvgpr_write_b32 a243, 0
+ v_accvgpr_write_b32 a242, 0
+ v_accvgpr_write_b32 a241, 0
+ v_accvgpr_write_b32 a240, 0
+ v_accvgpr_write_b32 a179, 0
+ v_accvgpr_write_b32 a178, 0
+ v_accvgpr_write_b32 a177, 0
+ v_accvgpr_write_b32 a176, 0
+ v_accvgpr_write_b32 a175, 0
+ v_accvgpr_write_b32 a174, 0
+ v_accvgpr_write_b32 a173, 0
+ v_accvgpr_write_b32 a172, 0
+ v_accvgpr_write_b32 a171, 0
+ v_accvgpr_write_b32 a170, 0
+ v_accvgpr_write_b32 a169, 0
+ v_accvgpr_write_b32 a168, 0
+ v_accvgpr_write_b32 a167, 0
+ v_accvgpr_write_b32 a166, 0
+ v_accvgpr_write_b32 a165, 0
+ v_accvgpr_write_b32 a164, 0
+ v_accvgpr_write_b32 a191, 0
+ v_accvgpr_write_b32 a190, 0
+ v_accvgpr_write_b32 a189, 0
+ v_accvgpr_write_b32 a188, 0
+ v_accvgpr_write_b32 a187, 0
+ v_accvgpr_write_b32 a186, 0
+ v_accvgpr_write_b32 a185, 0
+ v_accvgpr_write_b32 a184, 0
+ v_accvgpr_write_b32 a183, 0
+ v_accvgpr_write_b32 a182, 0
+ v_accvgpr_write_b32 a181, 0
+ v_accvgpr_write_b32 a180, 0
+ v_accvgpr_write_b32 a131, 0
+ v_accvgpr_write_b32 a130, 0
+ v_accvgpr_write_b32 a129, 0
+ v_accvgpr_write_b32 a128, 0
+ v_accvgpr_write_b32 a231, 0
+ v_accvgpr_write_b32 a230, 0
+ v_accvgpr_write_b32 a229, 0
+ v_accvgpr_write_b32 a228, 0
+ v_accvgpr_write_b32 a219, 0
+ v_accvgpr_write_b32 a218, 0
+ v_accvgpr_write_b32 a217, 0
+ v_accvgpr_write_b32 a216, 0
+ v_accvgpr_write_b32 a207, 0
+ v_accvgpr_write_b32 a206, 0
+ v_accvgpr_write_b32 a205, 0
+ v_accvgpr_write_b32 a204, 0
+ v_accvgpr_write_b32 a195, 0
+ v_accvgpr_write_b32 a194, 0
+ v_accvgpr_write_b32 a193, 0
+ v_accvgpr_write_b32 a192, 0
+ v_accvgpr_write_b32 a215, 0
+ v_accvgpr_write_b32 a214, 0
+ v_accvgpr_write_b32 a213, 0
+ v_accvgpr_write_b32 a212, 0
+ v_accvgpr_write_b32 a211, 0
+ v_accvgpr_write_b32 a210, 0
+ v_accvgpr_write_b32 a209, 0
+ v_accvgpr_write_b32 a208, 0
+ v_accvgpr_write_b32 a203, 0
+ v_accvgpr_write_b32 a202, 0
+ v_accvgpr_write_b32 a201, 0
+ v_accvgpr_write_b32 a200, 0
+ v_accvgpr_write_b32 a199, 0
+ v_accvgpr_write_b32 a198, 0
+ v_accvgpr_write_b32 a197, 0
+ v_accvgpr_write_b32 a196, 0
+ s_mov_b32 s11, 0x27000
+ s_mov_b32 s10, 0x7ffffffe
+.LBB0_5: ; =>This Inner Loop Header: Depth=1
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[74:75], v[94:95], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[76:77], v[96:97], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[78:79], v[90:91], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[80:81], v[92:93], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[74:75], v[90:91], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[76:77], v[92:93], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[70:71], v[94:95], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[72:73], v[96:97], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[66:67], v[94:95], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[68:69], v[96:97], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[70:71], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[72:73], v[92:93], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[66:67], v[90:91], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[68:69], v[92:93], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[78:79], v[86:87], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[80:81], v[88:89], a[32:35]
+ ds_read_b128 v[106:109], v0 offset:16384
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[74:75], v[86:87], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[76:77], v[88:89], a[36:39]
+ ds_read_b128 v[110:113], v171 offset:16384
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[78:79], v[82:83], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[80:81], v[84:85], a[40:43]
+ ds_read_b128 v[98:101], v0 offset:24576
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[74:75], v[82:83], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[76:77], v[84:85], a[44:47]
+ ds_read_b128 v[102:105], v171 offset:24576
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[70:71], v[86:87], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[72:73], v[88:89], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[66:67], v[86:87], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[68:69], v[88:89], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[70:71], v[82:83], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[72:73], v[84:85], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[66:67], v[82:83], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[68:69], v[84:85], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[106:107], v[94:95], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[108:109], v[96:97], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[110:111], v[94:95], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[112:113], v[96:97], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[106:107], v[90:91], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[108:109], v[92:93], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[110:111], v[90:91], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[112:113], v[92:93], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[98:99], v[94:95], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[100:101], v[96:97], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[102:103], v[94:95], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[104:105], v[96:97], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[98:99], v[90:91], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[100:101], v[92:93], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[102:103], v[90:91], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[104:105], v[92:93], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[106:107], v[86:87], a[96:99]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[108:109], v[88:89], a[96:99]
+ ds_read_b128 v[90:93], v1 offset:16384
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[110:111], v[86:87], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[112:113], v[88:89], a[100:103]
+ ds_read_b128 v[114:117], v1 offset:20480
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[106:107], v[82:83], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[108:109], v[84:85], a[104:107]
+ ds_read_b128 v[176:179], v1 offset:24576
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[110:111], v[82:83], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[112:113], v[84:85], a[108:111]
+ ds_read_b128 v[180:183], v1 offset:28672
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[98:99], v[86:87], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[100:101], v[88:89], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[102:103], v[86:87], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[104:105], v[88:89], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[98:99], v[82:83], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[100:101], v[84:85], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[102:103], v[82:83], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[104:105], v[84:85], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[132:135], v[78:79], v[90:91], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ s_and_b32 s9, s3, 0xffff
+ s_mov_b32 s8, s2
+ v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[92:93], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[74:75], v[90:91], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[76:77], v[92:93], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[78:79], v[114:115], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[80:81], v[116:117], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[74:75], v[114:115], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[76:77], v[116:117], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[70:71], v[90:91], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[72:73], v[92:93], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[66:67], v[90:91], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[68:69], v[92:93], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[70:71], v[114:115], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[72:73], v[116:117], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[66:67], v[114:115], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[68:69], v[116:117], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[78:79], v[176:177], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[80:81], v[178:179], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[74:75], v[176:177], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[76:77], v[178:179], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[78:79], v[180:181], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[80:81], v[182:183], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[74:75], v[180:181], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[76:77], v[182:183], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[70:71], v[176:177], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[72:73], v[178:179], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[66:67], v[176:177], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[68:69], v[178:179], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[70:71], v[180:181], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[72:73], v[182:183], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[66:67], v[180:181], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[68:69], v[182:183], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[106:107], v[90:91], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[108:109], v[92:93], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[110:111], v[90:91], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[112:113], v[92:93], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[106:107], v[114:115], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[108:109], v[116:117], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[110:111], v[114:115], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[112:113], v[116:117], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ ds_read_b128 v[94:97], v172
+ v_mfma_f32_16x16x16_f16 a[240:243], v[98:99], v[90:91], a[240:243]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[240:243], v[100:101], v[92:93], a[240:243]
+ ds_read_b128 v[118:121], v173 offset:4096
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[102:103], v[90:91], a[244:247]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[104:105], v[92:93], a[244:247]
+ ds_read_b128 v[78:81], v174 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[98:99], v[114:115], a[248:251]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[100:101], v[116:117], a[248:251]
+ ds_read_b128 v[82:85], v175 offset:4096
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[102:103], v[114:115], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[104:105], v[116:117], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ ds_read_b128 v[122:125], v173 offset:12288
+ ds_read_b128 v[126:129], v173 offset:8192
+ ds_read_b128 v[66:69], v175 offset:12288
+ ds_read_b128 v[74:77], v175 offset:8192
+ v_mfma_f32_16x16x16_f16 a[192:195], v[106:107], v[176:177], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[192:195], v[108:109], v[178:179], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[204:207], v[110:111], v[176:177], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[204:207], v[112:113], v[178:179], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[106:107], v[180:181], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[108:109], v[182:183], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[110:111], v[180:181], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[112:113], v[182:183], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[98:99], v[176:177], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[100:101], v[178:179], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[102:103], v[176:177], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[104:105], v[178:179], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[98:99], v[180:181], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[100:101], v[182:183], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[102:103], v[180:181], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[104:105], v[182:183], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[82:83], v[94:95], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[84:85], v[96:97], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[78:79], v[118:119], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[80:81], v[120:121], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[82:83], v[118:119], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[84:85], v[120:121], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ ds_read_b128 v[70:73], v175 offset:16384
+ ds_read_b128 v[106:109], v175 offset:20480
+ ds_read_b128 v[102:105], v175 offset:24576
+ ds_read_b128 v[98:101], v175 offset:28672
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[74:75], v[94:95], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[76:77], v[96:97], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[66:67], v[94:95], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[68:69], v[96:97], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[74:75], v[118:119], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[76:77], v[120:121], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[66:67], v[118:119], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[68:69], v[120:121], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ ds_read_b128 v[90:93], v173 offset:16384
+ ds_read_b128 v[86:89], v173 offset:20480
+ ds_read_b128 v[114:117], v173 offset:24576
+ ds_read_b128 v[110:113], v173 offset:28672
+ v_mfma_f32_16x16x16_f16 a[32:35], v[78:79], v[126:127], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ v_mfma_f32_16x16x16_f16 a[32:35], v[80:81], v[128:129], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[82:83], v[126:127], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[84:85], v[128:129], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[78:79], v[122:123], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[80:81], v[124:125], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[82:83], v[122:123], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[84:85], v[124:125], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[74:75], v[126:127], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[76:77], v[128:129], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[66:67], v[126:127], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[68:69], v[128:129], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[74:75], v[122:123], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[76:77], v[124:125], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[66:67], v[122:123], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[68:69], v[124:125], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000406)
+ s_waitcnt vmcnt(12)
+ ds_write_b128 v132, v[58:61] offset:12288
+ buffer_load_dwordx4 v[58:61], v158, s[8:11], 0 offen
+ ds_write_b128 v132, v[46:49] offset:8192
+ buffer_load_dwordx4 v[46:49], v157, s[8:11], 0 offen
+ v_mfma_f32_16x16x16_f16 a[64:67], v[70:71], v[94:95], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[72:73], v[96:97], a[64:67]
+ s_waitcnt vmcnt(12)
+ ds_write_b128 v132, v[62:65] offset:20480
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[106:107], v[94:95], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[108:109], v[96:97], a[68:71]
+ buffer_load_dwordx4 v[62:65], v160, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[70:71], v[118:119], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[72:73], v[120:121], a[72:75]
+ ds_write_b128 v132, v[38:41] offset:16384
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[106:107], v[118:119], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[108:109], v[120:121], a[76:79]
+ buffer_load_dwordx4 v[38:41], v159, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[102:103], v[94:95], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[104:105], v[96:97], a[80:83]
+ s_waitcnt vmcnt(12)
+ ds_write_b128 v132, v[26:29] offset:28672
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[98:99], v[94:95], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[100:101], v[96:97], a[84:87]
+ buffer_load_dwordx4 v[26:29], v162, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[102:103], v[118:119], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[104:105], v[120:121], a[88:91]
+ ds_write_b128 v132, v[34:37] offset:24576
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[98:99], v[118:119], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[100:101], v[120:121], a[92:95]
+ buffer_load_dwordx4 v[34:37], v161, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[70:71], v[126:127], a[96:99]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[72:73], v[128:129], a[96:99]
+ ds_write_b128 v132, v[54:57]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[106:107], v[126:127], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[108:109], v[128:129], a[100:103]
+ buffer_load_dwordx4 v[54:57], v155, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[70:71], v[122:123], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[72:73], v[124:125], a[104:107]
+ ds_write_b128 v132, v[50:53] offset:4096
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[106:107], v[122:123], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[108:109], v[124:125], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ buffer_load_dwordx4 v[50:53], v156, s[8:11], 0 offen
+ s_and_b32 s9, s5, 0xffff
+ s_mov_b32 s8, s4
+ v_mfma_f32_16x16x16_f16 a[112:115], v[102:103], v[126:127], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[104:105], v[128:129], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[98:99], v[126:127], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[100:101], v[128:129], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[102:103], v[122:123], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[104:105], v[124:125], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[98:99], v[122:123], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[100:101], v[124:125], a[124:127]
+ ; sched_barrier mask(0x00000406)
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v132, v[30:33] offset:32768
+ buffer_load_dwordx4 v[30:33], v163, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[22:25] offset:4096
+ buffer_load_dwordx4 v[22:25], v164, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[18:21] offset:8192
+ buffer_load_dwordx4 v[18:21], v165, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[14:17] offset:12288
+ buffer_load_dwordx4 v[14:17], v166, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[10:13] offset:16384
+ buffer_load_dwordx4 v[10:13], v167, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[6:9] offset:20480
+ buffer_load_dwordx4 v[6:9], v168, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[2:5] offset:24576
+ buffer_load_dwordx4 v[2:5], v169, s[8:11], 0 offen
+ s_waitcnt vmcnt(15)
+ ds_write_b128 v131, v[42:45] offset:28672
+ buffer_load_dwordx4 v[42:45], v170, s[8:11], 0 offen
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[132:135], v[78:79], v[90:91], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[92:93], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[82:83], v[90:91], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[84:85], v[92:93], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[78:79], v[86:87], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[80:81], v[88:89], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[82:83], v[86:87], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[84:85], v[88:89], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[74:75], v[90:91], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[76:77], v[92:93], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[66:67], v[90:91], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[68:69], v[92:93], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[74:75], v[86:87], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[76:77], v[88:89], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[66:67], v[86:87], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[68:69], v[88:89], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[78:79], v[114:115], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[80:81], v[116:117], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[82:83], v[114:115], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[84:85], v[116:117], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[78:79], v[110:111], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[80:81], v[112:113], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[82:83], v[110:111], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[84:85], v[112:113], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[74:75], v[114:115], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[76:77], v[116:117], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[66:67], v[114:115], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[68:69], v[116:117], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[74:75], v[110:111], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[76:77], v[112:113], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[66:67], v[110:111], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[68:69], v[112:113], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[70:71], v[90:91], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[72:73], v[92:93], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[106:107], v[90:91], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[108:109], v[92:93], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[70:71], v[86:87], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[72:73], v[88:89], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[106:107], v[86:87], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[108:109], v[88:89], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[240:243], v[102:103], v[90:91], a[240:243]
+ s_waitcnt lgkmcnt(0)
+ ; sched_barrier mask(0x000007F6)
+ s_barrier
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[240:243], v[104:105], v[92:93], a[240:243]
+ ds_read_b128 v[94:97], v1
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[98:99], v[90:91], a[244:247]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[100:101], v[92:93], a[244:247]
+ ds_read_b128 v[90:93], v1 offset:4096
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[102:103], v[86:87], a[248:251]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[104:105], v[88:89], a[248:251]
+ ds_read_b128 v[78:81], v133 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[98:99], v[86:87], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[100:101], v[88:89], a[252:255]
+ ds_read_b128 v[74:77], v0 offset:4096
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[192:195], v[70:71], v[114:115], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[192:195], v[72:73], v[116:117], a[192:195]
+ ds_read_b128 v[86:89], v1 offset:8192
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[204:207], v[106:107], v[114:115], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[204:207], v[108:109], v[116:117], a[204:207]
+ ds_read_b128 v[82:85], v1 offset:12288
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[70:71], v[110:111], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[72:73], v[112:113], a[216:219]
+ ds_read_b128 v[70:73], v0 offset:8192
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[106:107], v[110:111], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[108:109], v[112:113], a[228:231]
+ ds_read_b128 v[66:69], v0 offset:12288
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(2) SyncID(0)
+ ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[102:103], v[114:115], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[104:105], v[116:117], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[98:99], v[114:115], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[100:101], v[116:117], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[102:103], v[110:111], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[104:105], v[112:113], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[98:99], v[110:111], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[100:101], v[112:113], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_group_barrier mask(0x00000008) size(8) SyncID(0)
+ ; sched_barrier mask(0x00000000)
+ s_add_u32 s4, s4, 0x80
+ s_addc_u32 s5, s5, 0
+ s_add_u32 s2, s2, 0x80
+ s_addc_u32 s3, s3, 0
+ s_add_i32 s13, s13, -1
+ s_cmp_lg_u32 s13, 0
+ s_cbranch_scc1 .LBB0_5
+.LBB0_6: ; %Flow430
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_load_dword s4, s[0:1], 0x34
+ v_and_b32_e32 v108, 28, v152
+ v_or_b32_e32 v251, 0xe0, v108
+ v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_or_b32_e32 v106, 0x60, v108
+ v_or_b32_e32 v107, 64, v108
+ v_or_b32_e32 v109, 32, v108
+ s_nop 3
+ v_accvgpr_read_b32 v105, a3
+ v_accvgpr_read_b32 v104, a2
+ v_accvgpr_read_b32 v103, a1
+ v_accvgpr_read_b32 v102, a0
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[94:95], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v163, a3
+ v_accvgpr_read_b32 v162, a2
+ v_accvgpr_read_b32 v161, a1
+ v_accvgpr_read_b32 v160, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[90:91], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v171, a3
+ v_accvgpr_read_b32 v170, a2
+ v_accvgpr_read_b32 v169, a1
+ v_accvgpr_read_b32 v168, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[90:91], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v183, a3
+ v_accvgpr_read_b32 v182, a2
+ v_accvgpr_read_b32 v181, a1
+ v_accvgpr_read_b32 v180, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[94:95], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v159, a3
+ v_accvgpr_read_b32 v158, a2
+ v_accvgpr_read_b32 v157, a1
+ v_accvgpr_read_b32 v156, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[94:95], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v167, a3
+ v_accvgpr_read_b32 v166, a2
+ v_accvgpr_read_b32 v165, a1
+ v_accvgpr_read_b32 v164, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v175, a3
+ v_accvgpr_read_b32 v174, a2
+ v_accvgpr_read_b32 v173, a1
+ v_accvgpr_read_b32 v172, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[90:91], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v191, a3
+ v_accvgpr_read_b32 v190, a2
+ v_accvgpr_read_b32 v189, a1
+ v_accvgpr_read_b32 v188, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[86:87], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[80:81], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[86:87], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v179, a3
+ v_accvgpr_read_b32 v178, a2
+ v_accvgpr_read_b32 v177, a1
+ v_accvgpr_read_b32 v176, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[82:83], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[80:81], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v187, a3
+ v_accvgpr_read_b32 v186, a2
+ v_accvgpr_read_b32 v185, a1
+ v_accvgpr_read_b32 v184, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[82:83], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[76:77], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v195, a3
+ v_accvgpr_read_b32 v194, a2
+ v_accvgpr_read_b32 v193, a1
+ v_accvgpr_read_b32 v192, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[86:87], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v199, a3
+ v_accvgpr_read_b32 v198, a2
+ v_accvgpr_read_b32 v197, a1
+ v_accvgpr_read_b32 v196, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[86:87], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v203, a3
+ v_accvgpr_read_b32 v202, a2
+ v_accvgpr_read_b32 v201, a1
+ v_accvgpr_read_b32 v200, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[82:83], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v207, a3
+ v_accvgpr_read_b32 v206, a2
+ v_accvgpr_read_b32 v205, a1
+ v_accvgpr_read_b32 v204, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[82:83], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[68:69], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v211, a3
+ v_accvgpr_read_b32 v210, a2
+ v_accvgpr_read_b32 v209, a1
+ v_accvgpr_read_b32 v208, a0
+ ; sched_barrier mask(0x00000000)
+ v_lshlrev_b32_e32 v111, 1, v154
+ v_add_u32_e32 v110, 0, v111
+ ds_read_b128 v[98:101], v110 offset:32768
+ v_add_u32_e32 v111, s14, v111
+ ds_read_b128 v[114:117], v111 offset:4096
+ v_lshlrev_b32_e32 v113, 1, v153
+ v_add_u32_e32 v112, 0, v113
+ ds_read_b128 v[118:121], v112 offset:32768
+ v_add_u32_e32 v113, s14, v113
+ ds_read_b128 v[122:125], v113 offset:4096
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[94:95], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[100:101], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v215, a3
+ v_accvgpr_read_b32 v214, a2
+ v_accvgpr_read_b32 v213, a1
+ v_accvgpr_read_b32 v212, a0
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[94:95], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[116:117], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[90:91], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[100:101], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v219, a3
+ v_accvgpr_read_b32 v218, a2
+ v_accvgpr_read_b32 v217, a1
+ v_accvgpr_read_b32 v216, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[90:91], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[116:117], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[94:95], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v155, a3
+ v_accvgpr_read_b32 v154, a2
+ v_accvgpr_read_b32 v153, a1
+ v_accvgpr_read_b32 v152, a0
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[94:95], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[124:125], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[90:91], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[120:121], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[90:91], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v223, a3
+ v_accvgpr_read_b32 v222, a2
+ v_accvgpr_read_b32 v221, a1
+ v_accvgpr_read_b32 v220, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[86:87], a[96:99]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[100:101], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[86:87], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v227, a3
+ v_accvgpr_read_b32 v226, a2
+ v_accvgpr_read_b32 v225, a1
+ v_accvgpr_read_b32 v224, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[82:83], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[100:101], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[82:83], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v231, a3
+ v_accvgpr_read_b32 v230, a2
+ v_accvgpr_read_b32 v229, a1
+ v_accvgpr_read_b32 v228, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[86:87], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v235, a3
+ v_accvgpr_read_b32 v234, a2
+ v_accvgpr_read_b32 v233, a1
+ v_accvgpr_read_b32 v232, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[86:87], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v239, a3
+ v_accvgpr_read_b32 v238, a2
+ v_accvgpr_read_b32 v237, a1
+ v_accvgpr_read_b32 v236, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[82:83], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[120:121], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v243, a3
+ v_accvgpr_read_b32 v242, a2
+ v_accvgpr_read_b32 v241, a1
+ v_accvgpr_read_b32 v240, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[82:83], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[124:125], v[84:85], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v247, a3
+ v_accvgpr_read_b32 v246, a2
+ v_accvgpr_read_b32 v245, a1
+ v_accvgpr_read_b32 v244, a0
+ ; sched_barrier mask(0x00000000)
+ v_lshl_add_u32 v82, v151, 1, 0
+ ds_read_b128 v[86:89], v82
+ v_lshl_add_u32 v83, v149, 1, 0
+ ds_read_b128 v[94:97], v83
+ v_lshl_add_u32 v84, v150, 1, 0
+ ds_read_b128 v[90:93], v84
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[86:87], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ v_lshl_add_u32 v85, v148, 1, 0
+ ds_read_b128 v[126:129], v85
+ v_mfma_f32_16x16x16_f16 a[132:135], v[80:81], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[86:87], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[76:77], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[90:91], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[80:81], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[90:91], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[76:77], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[86:87], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[72:73], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[86:87], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[68:69], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[90:91], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[72:73], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v255, a3
+ v_accvgpr_read_b32 v254, a2
+ v_accvgpr_read_b32 v253, a1
+ v_accvgpr_read_b32 v252, a0
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[90:91], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[68:69], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[94:95], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[80:81], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[94:95], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[76:77], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[78:79], v[126:127], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[80:81], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[74:75], v[126:127], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[76:77], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[94:95], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[72:73], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[94:95], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[68:69], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[70:71], v[126:127], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[72:73], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[66:67], v[126:127], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[68:69], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[86:87], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[100:101], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[86:87], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[116:117], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[90:91], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[100:101], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[90:91], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[116:117], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[86:87], a[240:243]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[120:121], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[86:87], a[244:247]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[124:125], v[88:89], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[118:119], v[90:91], a[248:251]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[120:121], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[122:123], v[90:91], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[124:125], v[92:93], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[94:95], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[100:101], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[94:95], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[116:117], v[96:97], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[98:99], v[126:127], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[100:101], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[114:115], v[126:127], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[116:117], v[128:129], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[118:119], v[94:95], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[120:121], v[96:97], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[122:123], v[94:95], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[124:125], v[96:97], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[118:119], v[126:127], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[120:121], v[128:129], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[122:123], v[126:127], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[124:125], v[128:129], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ v_lshlrev_b32_e32 v70, 1, v147
+ v_add_u32_e32 v80, 0, v70
+ ds_read_b128 v[86:89], v80 offset:32768
+ v_lshl_add_u32 v81, v145, 1, 0
+ ds_read_b128 v[66:69], v81
+ s_nop 0
+ v_accvgpr_write_b32 a24, v102
+ v_accvgpr_write_b32 a25, v103
+ v_accvgpr_write_b32 a26, v104
+ v_accvgpr_write_b32 a27, v105
+ v_lshl_add_u32 v75, v146, 1, 0
+ v_add_u32_e32 v76, s14, v70
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[66:67], a[24:27]
+ ds_read_b128 v[90:93], v75
+ ds_read_b128 v[94:97], v76 offset:4096
+ ; sched_barrier mask(0x000007F6)
+ v_lshlrev_b32_e32 v70, 1, v144
+ v_add_u32_e32 v79, 0, v70
+ v_mfma_f32_16x16x16_f16 a[204:207], v[88:89], v[68:69], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[98:101], v79 offset:32768
+ v_add_u32_e32 v77, s14, v70
+ ds_read_b128 v[114:117], v77 offset:4096
+ v_accvgpr_write_b32 a24, v160
+ v_accvgpr_write_b32 a25, v161
+ v_accvgpr_write_b32 a26, v162
+ v_accvgpr_write_b32 a27, v163
+ v_lshl_add_u32 v78, v142, 1, 0
+ ds_read_b128 v[118:121], v78
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[66:67], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_lshl_add_u32 v74, v143, 1, 0
+ ds_read_b128 v[122:125], v74
+ v_mfma_f32_16x16x16_f16 a[208:211], v[96:97], v[68:69], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 3
+ v_accvgpr_write_b32 a24, v168
+ v_accvgpr_write_b32 a25, v169
+ v_accvgpr_write_b32 a26, v170
+ v_accvgpr_write_b32 a27, v171
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[88:89], v[92:93], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a24, v180
+ v_accvgpr_write_b32 a25, v181
+ v_accvgpr_write_b32 a26, v182
+ v_accvgpr_write_b32 a27, v183
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[96:97], v[92:93], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a24, v156
+ v_accvgpr_write_b32 a25, v157
+ v_accvgpr_write_b32 a26, v158
+ v_accvgpr_write_b32 a27, v159
+ s_waitcnt lgkmcnt(3)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[66:67], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[100:101], v[68:69], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a24, v164
+ v_accvgpr_write_b32 a25, v165
+ v_accvgpr_write_b32 a26, v166
+ v_accvgpr_write_b32 a27, v167
+ s_waitcnt lgkmcnt(2)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[66:67], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[116:117], v[68:69], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a24, v172
+ v_accvgpr_write_b32 a25, v173
+ v_accvgpr_write_b32 a26, v174
+ v_accvgpr_write_b32 a27, v175
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[100:101], v[92:93], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a24, v188
+ v_accvgpr_write_b32 a25, v189
+ v_accvgpr_write_b32 a26, v190
+ v_accvgpr_write_b32 a27, v191
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[90:91], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[116:117], v[92:93], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[118:119], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[88:89], v[120:121], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v105, a27
+ v_accvgpr_read_b32 v104, a26
+ v_accvgpr_read_b32 v103, a25
+ v_accvgpr_read_b32 v102, a24
+ v_accvgpr_write_b32 a24, v176
+ v_accvgpr_write_b32 a25, v177
+ v_accvgpr_write_b32 a26, v178
+ v_accvgpr_write_b32 a27, v179
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[118:119], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[96:97], v[120:121], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v159, a27
+ v_accvgpr_read_b32 v158, a26
+ v_accvgpr_read_b32 v157, a25
+ v_accvgpr_read_b32 v156, a24
+ v_accvgpr_write_b32 a24, v184
+ v_accvgpr_write_b32 a25, v185
+ v_accvgpr_write_b32 a26, v186
+ v_accvgpr_write_b32 a27, v187
+ s_waitcnt lgkmcnt(0)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[24:27], v[86:87], v[122:123], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[88:89], v[124:125], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v163, a27
+ v_accvgpr_read_b32 v162, a26
+ v_accvgpr_read_b32 v161, a25
+ v_accvgpr_read_b32 v160, a24
+ v_accvgpr_write_b32 a24, v192
+ v_accvgpr_write_b32 a25, v193
+ v_accvgpr_write_b32 a26, v194
+ v_accvgpr_write_b32 a27, v195
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[94:95], v[122:123], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[96:97], v[124:125], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v167, a27
+ v_accvgpr_read_b32 v166, a26
+ v_accvgpr_read_b32 v165, a25
+ v_accvgpr_read_b32 v164, a24
+ v_accvgpr_write_b32 a24, v196
+ v_accvgpr_write_b32 a25, v197
+ v_accvgpr_write_b32 a26, v198
+ v_accvgpr_write_b32 a27, v199
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[118:119], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[100:101], v[120:121], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v171, a27
+ v_accvgpr_read_b32 v170, a26
+ v_accvgpr_read_b32 v169, a25
+ v_accvgpr_read_b32 v168, a24
+ v_accvgpr_write_b32 a24, v200
+ v_accvgpr_write_b32 a25, v201
+ v_accvgpr_write_b32 a26, v202
+ v_accvgpr_write_b32 a27, v203
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[118:119], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[116:117], v[120:121], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v175, a27
+ v_accvgpr_read_b32 v174, a26
+ v_accvgpr_read_b32 v173, a25
+ v_accvgpr_read_b32 v172, a24
+ v_accvgpr_write_b32 a24, v204
+ v_accvgpr_write_b32 a25, v205
+ v_accvgpr_write_b32 a26, v206
+ v_accvgpr_write_b32 a27, v207
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[98:99], v[122:123], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[100:101], v[124:125], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v179, a27
+ v_accvgpr_read_b32 v178, a26
+ v_accvgpr_read_b32 v177, a25
+ v_accvgpr_read_b32 v176, a24
+ v_accvgpr_write_b32 a24, v208
+ v_accvgpr_write_b32 a25, v209
+ v_accvgpr_write_b32 a26, v210
+ v_accvgpr_write_b32 a27, v211
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[24:27], v[114:115], v[122:123], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[116:117], v[124:125], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v183, a27
+ v_accvgpr_read_b32 v182, a26
+ v_accvgpr_read_b32 v181, a25
+ v_accvgpr_read_b32 v180, a24
+ ; sched_barrier mask(0x00000000)
+ v_lshl_add_u32 v73, v140, 1, 0
+ ds_read_b128 v[126:129], v73 offset:32768
+ v_lshl_add_u32 v71, v141, 1, 0
+ ds_read_b128 v[140:143], v71 offset:32768
+ v_accvgpr_write_b32 a24, v212
+ v_accvgpr_write_b32 a25, v213
+ v_accvgpr_write_b32 a26, v214
+ v_accvgpr_write_b32 a27, v215
+ v_lshl_add_u32 v72, v138, 1, 0
+ ds_read_b128 v[144:147], v72 offset:32768
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[126:127], v[66:67], a[24:27]
+ v_lshl_add_u32 v70, v139, 1, 0
+ ds_read_b128 v[148:151], v70 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[128:129], v[68:69], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[140:141], v[66:67], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[142:143], v[68:69], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a20, v216
+ v_accvgpr_write_b32 a21, v217
+ v_accvgpr_write_b32 a22, v218
+ v_accvgpr_write_b32 a23, v219
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[20:23], v[126:127], v[90:91], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[128:129], v[92:93], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[140:141], v[90:91], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[142:143], v[92:93], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a8, v152
+ v_accvgpr_write_b32 a9, v153
+ v_accvgpr_write_b32 a10, v154
+ v_accvgpr_write_b32 a11, v155
+ s_waitcnt lgkmcnt(1)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[8:11], v[144:145], v[66:67], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[146:147], v[68:69], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[148:149], v[66:67], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[150:151], v[68:69], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 1
+ v_accvgpr_write_b32 a16, v220
+ v_accvgpr_write_b32 a17, v221
+ v_accvgpr_write_b32 a18, v222
+ v_mfma_f32_16x16x16_f16 a[12:15], v[144:145], v[90:91], a[12:15]
+ v_accvgpr_write_b32 a19, v223
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[146:147], v[92:93], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[16:19], v[148:149], v[90:91], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[150:151], v[92:93], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[126:127], v[118:119], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[128:129], v[120:121], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a52, v224
+ v_accvgpr_write_b32 a53, v225
+ v_accvgpr_write_b32 a54, v226
+ v_accvgpr_write_b32 a55, v227
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[52:55], v[140:141], v[118:119], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[142:143], v[120:121], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[126:127], v[122:123], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[128:129], v[124:125], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v228
+ v_accvgpr_write_b32 a49, v229
+ v_accvgpr_write_b32 a50, v230
+ v_accvgpr_write_b32 a51, v231
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[140:141], v[122:123], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[142:143], v[124:125], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v232
+ v_accvgpr_write_b32 a49, v233
+ v_accvgpr_write_b32 a50, v234
+ v_accvgpr_write_b32 a51, v235
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[144:145], v[118:119], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[146:147], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v236
+ v_accvgpr_write_b32 a49, v237
+ v_accvgpr_write_b32 a50, v238
+ v_accvgpr_write_b32 a51, v239
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[148:149], v[118:119], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[150:151], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v240
+ v_accvgpr_write_b32 a49, v241
+ v_accvgpr_write_b32 a50, v242
+ v_accvgpr_write_b32 a51, v243
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[144:145], v[122:123], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[146:147], v[124:125], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v244
+ v_accvgpr_write_b32 a49, v245
+ v_accvgpr_write_b32 a50, v246
+ v_accvgpr_write_b32 a51, v247
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[148:149], v[122:123], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[150:151], v[124:125], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ v_lshl_add_u32 v69, v136, 1, 0
+ ds_read_b128 v[90:93], v69
+ v_lshl_add_u32 v68, v137, 1, 0
+ ds_read_b128 v[118:121], v68
+ v_lshl_add_u32 v67, v134, 1, 0
+ ds_read_b128 v[122:125], v67
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[86:87], v[90:91], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ v_lshl_add_u32 v66, v135, 1, 0
+ ds_read_b128 v[134:137], v66
+ v_mfma_f32_16x16x16_f16 a[136:139], v[88:89], v[92:93], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[94:95], v[90:91], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[96:97], v[92:93], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[86:87], v[118:119], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[88:89], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[94:95], v[118:119], a[96:99]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[132:135], v[96:97], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[98:99], v[90:91], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[100:101], v[92:93], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[114:115], v[90:91], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[116:117], v[92:93], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a48, v252
+ v_accvgpr_write_b32 a49, v253
+ v_accvgpr_write_b32 a50, v254
+ v_accvgpr_write_b32 a51, v255
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[48:51], v[98:99], v[118:119], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[100:101], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[114:115], v[118:119], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[116:117], v[120:121], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[86:87], v[122:123], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[88:89], v[124:125], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[94:95], v[122:123], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[96:97], v[124:125], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 4
+ v_accvgpr_read_b32 v155, a47
+ v_accvgpr_read_b32 v154, a46
+ v_accvgpr_read_b32 v153, a45
+ v_accvgpr_read_b32 v152, a44
+ v_accvgpr_read_b32 v187, a43
+ v_accvgpr_read_b32 v186, a42
+ v_accvgpr_read_b32 v185, a41
+ v_accvgpr_read_b32 v184, a40
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[86:87], v[134:135], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[88:89], v[136:137], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v89, a43
+ v_accvgpr_read_b32 v88, a42
+ v_accvgpr_read_b32 v87, a41
+ v_accvgpr_read_b32 v86, a40
+ v_mfma_f32_16x16x16_f16 a[40:43], v[94:95], v[134:135], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[240:243], v[96:97], v[136:137], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[98:99], v[122:123], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[100:101], v[124:125], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v97, a43
+ v_accvgpr_read_b32 v96, a42
+ v_accvgpr_read_b32 v95, a41
+ v_accvgpr_read_b32 v94, a40
+ v_mfma_f32_16x16x16_f16 a[40:43], v[114:115], v[122:123], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[192:195], v[116:117], v[124:125], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[98:99], v[134:135], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[100:101], v[136:137], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[114:115], v[134:135], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[116:117], v[136:137], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[126:127], v[90:91], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[128:129], v[92:93], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[140:141], v[90:91], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[142:143], v[92:93], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[126:127], v[118:119], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[128:129], v[120:121], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[140:141], v[118:119], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[142:143], v[120:121], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[144:145], v[90:91], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[146:147], v[92:93], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[148:149], v[90:91], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[150:151], v[92:93], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[144:145], v[118:119], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[146:147], v[120:121], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[148:149], v[118:119], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[150:151], v[120:121], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[126:127], v[122:123], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[96:99], v[128:129], v[124:125], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[140:141], v[122:123], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[142:143], v[124:125], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[126:127], v[134:135], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[128:129], v[136:137], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[140:141], v[134:135], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[142:143], v[136:137], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[144:145], v[122:123], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[232:235], v[146:147], v[124:125], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[148:149], v[122:123], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[150:151], v[124:125], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[144:145], v[134:135], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[146:147], v[136:137], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[148:149], v[134:135], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[124:127], v[150:151], v[136:137], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ s_barrier
+ s_waitcnt vmcnt(9)
+ ds_write_b128 v132, v[54:57]
+ s_waitcnt vmcnt(8)
+ ds_write_b128 v132, v[50:53] offset:4096
+ ds_write_b128 v132, v[46:49] offset:8192
+ ds_write_b128 v132, v[58:61] offset:12288
+ ds_write_b128 v132, v[38:41] offset:16384
+ ds_write_b128 v132, v[62:65] offset:20480
+ ds_write_b128 v132, v[34:37] offset:24576
+ ds_write_b128 v132, v[26:29] offset:28672
+ s_waitcnt vmcnt(7)
+ ds_write_b128 v132, v[30:33] offset:32768
+ s_waitcnt vmcnt(6)
+ ds_write_b128 v131, v[22:25] offset:4096
+ s_waitcnt vmcnt(5)
+ ds_write_b128 v131, v[18:21] offset:8192
+ s_waitcnt vmcnt(4)
+ ds_write_b128 v131, v[14:17] offset:12288
+ s_waitcnt vmcnt(3)
+ ds_write_b128 v131, v[10:13] offset:16384
+ s_waitcnt vmcnt(2)
+ ds_write_b128 v131, v[6:9] offset:20480
+ s_waitcnt vmcnt(1)
+ ds_write_b128 v131, v[2:5] offset:24576
+ s_waitcnt vmcnt(0)
+ ds_write_b128 v131, v[42:45] offset:28672
+ ; sched_barrier mask(0x00000000)
+ s_waitcnt lgkmcnt(0)
+ s_barrier
+ ds_read_b128 v[2:5], v133 offset:32768
+ ds_read_b128 v[6:9], v1
+ ds_read_b128 v[10:13], v0 offset:4096
+ ds_read_b128 v[14:17], v1 offset:4096
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[2:3], v[6:7], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[18:21], v0 offset:8192
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[22:25], v0 offset:12288
+ ds_read_b128 v[26:29], v110 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[30:33], v111 offset:4096
+ ds_read_b128 v[34:37], v112 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[38:41], v113 offset:4096
+ v_mfma_f32_16x16x16_f16 a[36:39], v[4:5], v[8:9], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(7)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[10:11], v[6:7], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[120:123], v[12:13], v[8:9], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[2:3], v[14:15], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[4:5], v[16:17], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[10:11], v[14:15], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[40:43], v[12:13], v[16:17], a[4:7]
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[6:7], a[244:247]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[20:21], v[8:9], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[22:23], v[6:7], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[24:25], v[8:9], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[14:15], a[248:251]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[20:21], v[16:17], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[22:23], v[14:15], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[88:91], v[24:25], v[16:17], a[32:35]
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[26:27], v[6:7], a[188:191]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[28:29], v[8:9], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[30:31], v[6:7], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[164:167], v[32:33], v[8:9], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[24:27], v[26:27], v[14:15], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[204:207], v[28:29], v[16:17], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[30:31], v[14:15], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[172:175], v[32:33], v[16:17], a[20:23]
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[20:23], v[34:35], v[6:7], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[168:171], v[36:37], v[8:9], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[38:39], v[6:7], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[160:163], v[40:41], v[8:9], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[34:35], v[14:15], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[244:247], v[36:37], v[16:17], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[38:39], v[14:15], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[236:239], v[40:41], v[16:17], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ ds_read_b128 v[6:9], v1 offset:8192
+ ds_read_b128 v[14:17], v1 offset:12288
+ s_nop 3
+ v_accvgpr_write_b32 a8, v102
+ v_accvgpr_write_b32 a9, v103
+ v_accvgpr_write_b32 a10, v104
+ v_accvgpr_write_b32 a11, v105
+ v_accvgpr_write_b32 a12, v156
+ v_accvgpr_write_b32 a13, v157
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[6:7], a[8:11]
+ v_accvgpr_write_b32 a14, v158
+ v_accvgpr_write_b32 a15, v159
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a16, v160
+ v_mfma_f32_16x16x16_f16 a[8:11], v[4:5], v[8:9], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a17, v161
+ v_accvgpr_write_b32 a18, v162
+ v_accvgpr_write_b32 a19, v163
+ v_mfma_f32_16x16x16_f16 a[12:15], v[10:11], v[6:7], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a20, v164
+ v_accvgpr_write_b32 a21, v165
+ v_accvgpr_write_b32 a22, v166
+ v_mfma_f32_16x16x16_f16 a[12:15], v[12:13], v[8:9], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a23, v167
+ v_accvgpr_write_b32 a24, v168
+ v_accvgpr_write_b32 a25, v169
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[2:3], v[14:15], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a26, v170
+ v_accvgpr_write_b32 a27, v171
+ v_accvgpr_write_b32 a28, v172
+ v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[16:17], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a29, v173
+ v_accvgpr_write_b32 a30, v174
+ v_accvgpr_write_b32 a31, v175
+ v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[14:15], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a32, v176
+ v_accvgpr_write_b32 a33, v177
+ v_accvgpr_write_b32 a34, v178
+ v_mfma_f32_16x16x16_f16 a[20:23], v[12:13], v[16:17], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a35, v179
+ v_accvgpr_write_b32 a52, v180
+ v_accvgpr_write_b32 a53, v181
+ v_mfma_f32_16x16x16_f16 a[24:27], v[18:19], v[6:7], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a54, v182
+ v_accvgpr_write_b32 a55, v183
+ v_mfma_f32_16x16x16_f16 a[24:27], v[20:21], v[8:9], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[22:23], v[6:7], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[24:25], v[8:9], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[18:19], v[14:15], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[32:35], v[20:21], v[16:17], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[22:23], v[14:15], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[52:55], v[24:25], v[16:17], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[26:27], v[6:7], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[228:231], v[28:29], v[8:9], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[30:31], v[6:7], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[140:143], v[32:33], v[8:9], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[26:27], v[14:15], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[144:147], v[28:29], v[16:17], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[30:31], v[14:15], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[156:159], v[32:33], v[16:17], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[34:35], v[6:7], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[176:179], v[36:37], v[8:9], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[38:39], v[6:7], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[180:183], v[40:41], v[8:9], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[34:35], v[14:15], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[148:151], v[36:37], v[16:17], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[38:39], v[14:15], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[248:251], v[40:41], v[16:17], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ ; sched_barrier mask(0x00000000)
+ ds_read_b128 v[6:9], v82
+ ds_read_b128 v[14:17], v84
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[2:3], v[6:7], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[216:219], v[4:5], v[8:9], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[10:11], v[6:7], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[212:215], v[12:13], v[8:9], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[2:3], v[14:15], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[112:115], v[4:5], v[16:17], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[10:11], v[14:15], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[136:139], v[12:13], v[16:17], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[18:19], v[6:7], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[128:131], v[20:21], v[8:9], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[22:23], v[6:7], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[104:107], v[24:25], v[8:9], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[18:19], v[14:15], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[100:103], v[20:21], v[16:17], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[22:23], v[14:15], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[108:111], v[24:25], v[16:17], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[26:27], v[6:7], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[132:135], v[28:29], v[8:9], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[30:31], v[6:7], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[252:255], v[32:33], v[8:9], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[14:15], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[28:29], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v115, a59
+ v_accvgpr_read_b32 v114, a58
+ v_accvgpr_read_b32 v113, a57
+ v_accvgpr_read_b32 v112, a56
+ v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[14:15], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[84:87], v[32:33], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[6:7], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[36:37], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v119, a59
+ v_accvgpr_read_b32 v118, a58
+ v_accvgpr_read_b32 v117, a57
+ v_accvgpr_read_b32 v116, a56
+ v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[6:7], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[40:41], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v237, a59
+ v_accvgpr_read_b32 v236, a58
+ v_accvgpr_read_b32 v235, a57
+ v_accvgpr_read_b32 v234, a56
+ v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[14:15], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[36:37], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v233, a59
+ v_accvgpr_read_b32 v232, a58
+ v_accvgpr_read_b32 v231, a57
+ v_accvgpr_read_b32 v230, a56
+ v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[14:15], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[40:41], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v229, a59
+ v_accvgpr_read_b32 v228, a58
+ v_accvgpr_read_b32 v227, a57
+ v_accvgpr_read_b32 v226, a56
+ ; sched_barrier mask(0x00000000)
+ ds_read_b128 v[6:9], v83
+ ds_read_b128 v[14:17], v85
+ v_accvgpr_write_b32 a56, v152
+ v_accvgpr_write_b32 a57, v153
+ v_accvgpr_write_b32 a58, v154
+ v_accvgpr_write_b32 a59, v155
+ s_waitcnt lgkmcnt(1)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[56:59], v[2:3], v[6:7], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[60:63], v[4:5], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a56, v184
+ v_accvgpr_write_b32 a57, v185
+ v_accvgpr_write_b32 a58, v186
+ v_accvgpr_write_b32 a59, v187
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[56:59], v[10:11], v[6:7], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[64:67], v[12:13], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a56, v86
+ v_accvgpr_write_b32 a57, v87
+ v_accvgpr_write_b32 a58, v88
+ v_accvgpr_write_b32 a59, v89
+ s_waitcnt lgkmcnt(0)
+ s_nop 0
+ v_mfma_f32_16x16x16_f16 a[56:59], v[2:3], v[14:15], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[184:187], v[4:5], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[10:11], v[14:15], a[240:243]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[80:83], v[12:13], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 5
+ v_accvgpr_write_b32 a56, v94
+ v_accvgpr_write_b32 a57, v95
+ v_accvgpr_write_b32 a58, v96
+ v_accvgpr_write_b32 a59, v97
+ s_nop 1
+ v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[6:7], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[152:155], v[20:21], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[22:23], v[6:7], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[72:75], v[24:25], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[14:15], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[68:71], v[20:21], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[22:23], v[14:15], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[76:79], v[24:25], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[6:7], a[96:99]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[240:243], v[28:29], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[6:7], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[196:199], v[32:33], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[26:27], v[14:15], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[192:195], v[28:29], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[30:31], v[14:15], a[224:227]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[220:223], v[32:33], v[16:17], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[34:35], v[6:7], a[232:235]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[200:203], v[36:37], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[38:39], v[6:7], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[116:119], v[40:41], v[8:9], a[56:59]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[34:35], v[14:15], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[92:95], v[36:37], v[16:17], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[38:39], v[14:15], a[124:127]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[40:41], v[16:17], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 6
+ v_accvgpr_read_b32 v105, a3
+ v_accvgpr_read_b32 v104, a2
+ v_accvgpr_read_b32 v103, a1
+ v_accvgpr_read_b32 v102, a0
+ ; sched_barrier mask(0x00000000)
+ ds_read_b128 v[20:23], v80 offset:32768
+ ds_read_b128 v[32:35], v81
+ ds_read_b128 v[28:31], v76 offset:4096
+ ds_read_b128 v[36:39], v75
+ ds_read_b128 v[16:19], v79 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[24:27], v77 offset:4096
+ ds_read_b128 v[46:49], v78
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[58:61], v74
+ s_waitcnt lgkmcnt(6)
+ v_mfma_f32_16x16x16_f16 a[0:3], v[20:21], v[32:33], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[8:11], v73 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[12:15], v71 offset:32768
+ ds_read_b128 v[4:7], v72 offset:32768
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[0:3], v70 offset:32768
+ ds_read_b128 v[218:221], v69
+ ds_read_b128 v[222:225], v68
+ v_mfma_f32_16x16x16_f16 a[0:3], v[22:23], v[34:35], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ ds_read_b128 v[252:255], v67
+ s_mul_i32 s0, s15, s4
+ s_ashr_i32 s1, s0, 31
+ s_waitcnt lgkmcnt(12)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[28:29], v[32:33], a[120:123]
+ ; sched_barrier mask(0x000007F6)
+ s_lshl_b64 s[0:1], s[0:1], 1
+ s_add_u32 s2, s6, s0
+ s_addc_u32 s3, s7, s1
+ v_mfma_f32_16x16x16_f16 a[232:235], v[30:31], v[34:35], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v94, a0
+ v_accvgpr_read_b32 v95, a1
+ v_accvgpr_read_b32 v99, a2
+ s_waitcnt lgkmcnt(11)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[20:21], v[36:37], a[208:211]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v98, a3
+ v_cvt_f16_f32_e32 v94, v94
+ v_cvt_f16_f32_e32 v99, v99
+ v_mfma_f32_16x16x16_f16 a[120:123], v[22:23], v[38:39], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v95, v95
+ s_ashr_i32 s13, s12, 31
+ s_lshl_b64 s[0:1], s[12:13], 1
+ v_mfma_f32_16x16x16_f16 a[36:39], v[28:29], v[36:37], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ s_add_u32 s0, s2, s0
+ s_addc_u32 s1, s3, s1
+ v_accvgpr_read_b32 v91, a232
+ v_mfma_f32_16x16x16_f16 a[96:99], v[30:31], v[38:39], a[36:39]
+ v_accvgpr_read_b32 v84, a120
+ v_accvgpr_read_b32 v82, a121
+ v_accvgpr_read_b32 v89, a122
+ s_waitcnt lgkmcnt(10)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[16:17], v[32:33], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v90, a123
+ v_cvt_f16_f32_e32 v84, v84
+ v_cvt_f16_f32_e32 v89, v89
+ v_mfma_f32_16x16x16_f16 a[40:43], v[18:19], v[34:35], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v79, a96
+ v_accvgpr_read_b32 v83, a97
+ v_accvgpr_read_b32 v86, a98
+ s_waitcnt lgkmcnt(9)
+ v_mfma_f32_16x16x16_f16 a[36:39], v[24:25], v[32:33], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v87, a99
+ v_cvt_f16_f32_e32 v90, v90
+ v_cvt_f16_f32_e32 v82, v82
+ v_mfma_f32_16x16x16_f16 a[36:39], v[26:27], v[34:35], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v80, a40
+ v_accvgpr_read_b32 v78, a41
+ v_accvgpr_read_b32 v81, a42
+ v_mfma_f32_16x16x16_f16 a[4:7], v[16:17], v[36:37], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v85, a43
+ v_accvgpr_read_b32 v88, a233
+ v_accvgpr_read_b32 v92, a234
+ v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], v[38:39], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v74, a36
+ v_accvgpr_read_b32 v70, a37
+ v_accvgpr_read_b32 v76, a38
+ v_mfma_f32_16x16x16_f16 a[44:47], v[24:25], v[36:37], a[88:91]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v77, a39
+ v_accvgpr_read_b32 v93, a235
+ s_and_b32 s1, s1, 0xffff
+ s_waitcnt lgkmcnt(8)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[46:47], a[8:11]
+ v_accvgpr_read_b32 v56, a4
+ v_accvgpr_read_b32 v57, a5
+ v_accvgpr_read_b32 v64, a6
+ v_mfma_f32_16x16x16_f16 a[48:51], v[26:27], v[38:39], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v65, a7
+ s_mov_b32 s3, 0x27000
+ s_mov_b32 s2, 0x7ffffffe
+ v_mfma_f32_16x16x16_f16 a[88:91], v[22:23], v[48:49], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v86, v86
+ v_cvt_f16_f32_e32 v87, v87
+ v_cvt_f16_f32_e32 v81, v81
+ v_mfma_f32_16x16x16_f16 a[8:11], v[28:29], v[46:47], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v71, a48
+ v_accvgpr_read_b32 v72, a49
+ v_accvgpr_read_b32 v73, a50
+ v_mfma_f32_16x16x16_f16 a[44:47], v[30:31], v[48:49], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v75, a51
+ v_pack_b32_f16 v87, v86, v87
+ v_cvt_f16_f32_e32 v85, v85
+ s_waitcnt lgkmcnt(7)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[58:59], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v64, v64
+ v_cvt_f16_f32_e32 v65, v65
+ v_cvt_f16_f32_e32 v76, v76
+ v_mfma_f32_16x16x16_f16 a[12:15], v[22:23], v[60:61], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v40, a47
+ v_cvt_f16_f32_e32 v77, v77
+ v_cvt_f16_f32_e32 v56, v56
+ v_mfma_f32_16x16x16_f16 a[8:11], v[28:29], v[58:59], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[224:227], v[30:31], v[60:61], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 1
+ v_accvgpr_read_b32 v44, a12
+ v_accvgpr_read_b32 v45, a13
+ v_mfma_f32_16x16x16_f16 a[8:11], v[16:17], v[46:47], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[208:211], v[18:19], v[48:49], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v50, a224
+ v_accvgpr_read_b32 v51, a225
+ v_accvgpr_read_b32 v52, a226
+ v_mfma_f32_16x16x16_f16 a[8:11], v[24:25], v[46:47], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v53, a227
+ v_mfma_f32_16x16x16_f16 a[124:127], v[26:27], v[48:49], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v41, a209
+ v_accvgpr_read_b32 v42, a210
+ v_accvgpr_read_b32 v43, a211
+ v_mfma_f32_16x16x16_f16 a[8:11], v[16:17], v[58:59], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[56:59], v[18:19], v[60:61], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 1
+ v_accvgpr_read_b32 v54, a126
+ v_accvgpr_read_b32 v55, a127
+ v_mfma_f32_16x16x16_f16 a[8:11], v[24:25], v[58:59], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[32:33], a[188:191]
+ v_mfma_f32_16x16x16_f16 a[8:11], v[26:27], v[60:61], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[188:191], v[10:11], v[34:35], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ s_waitcnt lgkmcnt(5)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[32:33], a[164:167]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 3
+ v_accvgpr_read_b32 v62, a8
+ v_mfma_f32_16x16x16_f16 a[24:27], v[14:15], v[34:35], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v63, a9
+ v_accvgpr_read_b32 v68, a10
+ v_accvgpr_read_b32 v69, a11
+ v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[36:37], a[204:207]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v128, a188
+ v_accvgpr_read_b32 v133, a189
+ v_accvgpr_read_b32 v134, a190
+ v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[38:39], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v149, a24
+ v_accvgpr_read_b32 v150, a25
+ v_accvgpr_read_b32 v151, a26
+ v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[36:37], a[172:175]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v161, a27
+ v_accvgpr_read_b32 v135, a191
+ v_mfma_f32_16x16x16_f16 a[172:175], v[14:15], v[38:39], a[16:19]
+ v_accvgpr_read_b32 v162, a20
+ v_accvgpr_read_b32 v163, a21
+ v_accvgpr_read_b32 v180, a22
+ s_waitcnt lgkmcnt(4)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[32:33], a[168:171]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v181, a23
+ v_mfma_f32_16x16x16_f16 a[164:167], v[6:7], v[34:35], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v182, a172
+ v_accvgpr_read_b32 v191, a173
+ v_accvgpr_read_b32 v192, a174
+ s_waitcnt lgkmcnt(3)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[0:1], v[32:33], a[160:163]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v193, a175
+ v_accvgpr_read_b32 v32, a88
+ v_accvgpr_read_b32 v33, a89
+ v_mfma_f32_16x16x16_f16 a[32:35], v[2:3], v[34:35], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v157, a166
+ v_accvgpr_read_b32 v168, a167
+ v_accvgpr_read_b32 v155, a164
+ v_mfma_f32_16x16x16_f16 a[16:19], v[4:5], v[36:37], a[244:247]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v156, a165
+ v_accvgpr_read_b32 v34, a90
+ v_accvgpr_read_b32 v35, a91
+ v_mfma_f32_16x16x16_f16 a[168:171], v[6:7], v[38:39], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v169, a32
+ v_accvgpr_read_b32 v170, a33
+ v_accvgpr_read_b32 v186, a34
+ v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[36:37], a[236:239]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v187, a35
+ v_accvgpr_read_b32 v36, a44
+ v_cvt_f16_f32_e32 v32, v32
+ v_mfma_f32_16x16x16_f16 a[52:55], v[2:3], v[38:39], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v38, a45
+ v_accvgpr_read_b32 v39, a46
+ v_accvgpr_read_b32 v188, a168
+ v_mfma_f32_16x16x16_f16 a[0:3], v[8:9], v[46:47], a[228:231]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v197, a169
+ v_accvgpr_read_b32 v198, a170
+ v_accvgpr_read_b32 v199, a171
+ v_mfma_f32_16x16x16_f16 a[0:3], v[10:11], v[48:49], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v209, a52
+ v_accvgpr_read_b32 v210, a53
+ v_accvgpr_read_b32 v211, a54
+ v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[46:47], a[140:143]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v212, a55
+ v_accvgpr_read_b32 v37, a208
+ v_mfma_f32_16x16x16_f16 a[160:163], v[14:15], v[48:49], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v96, a0
+ v_accvgpr_read_b32 v97, a1
+ v_accvgpr_read_b32 v100, a2
+ v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], v[58:59], a[144:147]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v101, a3
+ v_mfma_f32_16x16x16_f16 a[16:19], v[10:11], v[60:61], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v110, a161
+ v_accvgpr_read_b32 v140, a162
+ v_accvgpr_read_b32 v141, a163
+ v_mfma_f32_16x16x16_f16 a[4:7], v[12:13], v[58:59], a[156:159]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[14:15], v[60:61], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v142, a16
+ v_accvgpr_read_b32 v143, a17
+ v_accvgpr_read_b32 v144, a18
+ v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[46:47], a[176:179]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v145, a19
+ v_mfma_f32_16x16x16_f16 a[48:51], v[6:7], v[48:49], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v152, a28
+ v_accvgpr_read_b32 v153, a29
+ v_accvgpr_read_b32 v154, a30
+ v_mfma_f32_16x16x16_f16 a[4:7], v[0:1], v[46:47], a[180:183]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v46, a14
+ v_accvgpr_read_b32 v164, a31
+ v_accvgpr_read_b32 v47, a124
+ v_mfma_f32_16x16x16_f16 a[96:99], v[2:3], v[48:49], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v49, a15
+ v_accvgpr_read_b32 v123, a48
+ v_accvgpr_read_b32 v124, a49
+ v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[58:59], a[148:151]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v129, a50
+ v_accvgpr_read_b32 v131, a51
+ v_accvgpr_read_b32 v48, a125
+ v_mfma_f32_16x16x16_f16 a[120:123], v[6:7], v[60:61], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v132, a96
+ v_accvgpr_read_b32 v137, a97
+ v_accvgpr_read_b32 v138, a98
+ v_mfma_f32_16x16x16_f16 a[4:7], v[0:1], v[58:59], a[248:251]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v58, a56
+ v_accvgpr_read_b32 v59, a57
+ v_accvgpr_read_b32 v139, a99
+ v_mfma_f32_16x16x16_f16 a[140:143], v[2:3], v[60:61], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v60, a58
+ v_accvgpr_read_b32 v61, a59
+ v_accvgpr_read_b32 v158, a120
+ s_waitcnt lgkmcnt(2)
+ v_mfma_f32_16x16x16_f16 a[4:7], v[20:21], v[218:219], a[216:219]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v159, a121
+ v_accvgpr_read_b32 v160, a122
+ v_accvgpr_read_b32 v171, a123
+ v_mfma_f32_16x16x16_f16 a[12:15], v[22:23], v[220:221], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v172, a140
+ v_accvgpr_read_b32 v173, a141
+ v_accvgpr_read_b32 v189, a142
+ v_mfma_f32_16x16x16_f16 a[4:7], v[28:29], v[218:219], a[212:215]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v190, a143
+ v_mfma_f32_16x16x16_f16 a[4:7], v[30:31], v[220:221], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v136, a12
+ v_accvgpr_read_b32 v146, a13
+ v_accvgpr_read_b32 v147, a14
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[8:11], v[20:21], v[222:223], a[112:115]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v148, a15
+ v_mfma_f32_16x16x16_f16 a[8:11], v[22:23], v[224:225], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v165, a4
+ v_accvgpr_read_b32 v166, a5
+ v_accvgpr_read_b32 v167, a6
+ v_mfma_f32_16x16x16_f16 a[24:27], v[28:29], v[222:223], a[136:139]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v177, a7
+ v_mfma_f32_16x16x16_f16 a[20:23], v[30:31], v[224:225], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v178, a8
+ v_accvgpr_read_b32 v179, a9
+ v_accvgpr_read_b32 v194, a10
+ v_mfma_f32_16x16x16_f16 a[24:27], v[16:17], v[218:219], a[128:131]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v195, a11
+ v_mfma_f32_16x16x16_f16 a[24:27], v[18:19], v[220:221], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v196, a20
+ v_accvgpr_read_b32 v203, a21
+ v_accvgpr_read_b32 v204, a22
+ v_mfma_f32_16x16x16_f16 a[36:39], v[24:25], v[218:219], a[104:107]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v205, a23
+ v_mfma_f32_16x16x16_f16 a[32:35], v[26:27], v[220:221], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v174, a24
+ v_accvgpr_read_b32 v175, a25
+ v_accvgpr_read_b32 v176, a26
+ v_mfma_f32_16x16x16_f16 a[36:39], v[16:17], v[222:223], a[100:103]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v183, a27
+ v_mfma_f32_16x16x16_f16 a[36:39], v[18:19], v[224:225], a[36:39]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v184, a32
+ v_accvgpr_read_b32 v185, a33
+ v_accvgpr_read_b32 v200, a34
+ v_mfma_f32_16x16x16_f16 a[40:43], v[24:25], v[222:223], a[108:111]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v201, a35
+ v_accvgpr_write_b32 a32, v234
+ v_accvgpr_write_b32 a33, v235
+ v_mfma_f32_16x16x16_f16 a[0:3], v[26:27], v[224:225], a[40:43]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a34, v236
+ v_accvgpr_write_b32 a35, v237
+ v_accvgpr_read_b32 v202, a36
+ ds_read_b128 a[40:43], v66
+ s_waitcnt lgkmcnt(1)
+ v_mfma_f32_16x16x16_f16 a[44:47], v[20:21], v[252:253], a[60:63]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v67, a160
+ v_accvgpr_read_b32 v206, a37
+ v_accvgpr_read_b32 v213, a0
+ v_mfma_f32_16x16x16_f16 a[44:47], v[22:23], v[254:255], a[44:47]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v214, a1
+ v_accvgpr_read_b32 v215, a2
+ v_accvgpr_read_b32 v216, a3
+ v_mfma_f32_16x16x16_f16 a[16:19], v[28:29], v[252:253], a[64:67]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a0, v112
+ v_accvgpr_write_b32 a1, v113
+ v_accvgpr_write_b32 a2, v114
+ v_mfma_f32_16x16x16_f16 a[16:19], v[30:31], v[254:255], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a3, v115
+ v_accvgpr_read_b32 v207, a38
+ v_accvgpr_read_b32 v208, a39
+ s_waitcnt lgkmcnt(0)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[20:21], a[40:41], a[184:187]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[28:31], v[22:23], a[42:43], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v20, a16
+ v_accvgpr_read_b32 v22, a17
+ v_accvgpr_read_b32 v23, a18
+ v_mfma_f32_16x16x16_f16 a[48:51], v[28:29], a[40:41], a[80:83]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[48:51], v[30:31], a[42:43], a[48:51]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v28, a28
+ v_accvgpr_read_b32 v29, a29
+ v_accvgpr_read_b32 v30, a30
+ v_mfma_f32_16x16x16_f16 a[52:55], v[16:17], v[252:253], a[152:155]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v112, a31
+ v_accvgpr_write_b32 a28, v116
+ v_accvgpr_write_b32 a29, v117
+ v_mfma_f32_16x16x16_f16 a[52:55], v[18:19], v[254:255], a[52:55]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a30, v118
+ v_accvgpr_write_b32 a31, v119
+ v_accvgpr_read_b32 v113, a48
+ v_mfma_f32_16x16x16_f16 a[12:15], v[24:25], v[252:253], a[72:75]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v114, a49
+ v_accvgpr_read_b32 v118, a50
+ v_accvgpr_read_b32 v119, a51
+ v_mfma_f32_16x16x16_f16 a[12:15], v[26:27], v[254:255], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v21, a52
+ v_mfma_f32_16x16x16_f16 a[4:7], v[16:17], a[40:41], a[68:71]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v16, a44
+ v_accvgpr_read_b32 v17, a45
+ v_cvt_f16_f32_e32 v16, v16
+ v_mfma_f32_16x16x16_f16 a[4:7], v[18:19], a[42:43], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v31, a12
+ v_accvgpr_read_b32 v66, a13
+ v_accvgpr_read_b32 v111, a14
+ v_mfma_f32_16x16x16_f16 a[8:11], v[24:25], a[40:41], a[76:79]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v24, a19
+ v_accvgpr_read_b32 v115, a15
+ v_accvgpr_write_b32 a12, v230
+ v_mfma_f32_16x16x16_f16 a[8:11], v[26:27], a[42:43], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a13, v231
+ v_accvgpr_write_b32 a14, v232
+ v_accvgpr_write_b32 a15, v233
+ v_mfma_f32_16x16x16_f16 a[20:23], v[8:9], v[218:219], a[132:135]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v116, a4
+ v_accvgpr_read_b32 v117, a5
+ v_accvgpr_read_b32 v120, a6
+ v_mfma_f32_16x16x16_f16 a[20:23], v[10:11], v[220:221], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v122, a8
+ v_accvgpr_read_b32 v125, a9
+ v_accvgpr_read_b32 v126, a10
+ v_mfma_f32_16x16x16_f16 a[24:27], v[12:13], v[218:219], a[252:255]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v127, a11
+ v_accvgpr_write_b32 a8, v226
+ v_accvgpr_write_b32 a9, v227
+ v_mfma_f32_16x16x16_f16 a[24:27], v[14:15], v[220:221], a[24:27]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a10, v228
+ v_accvgpr_write_b32 a11, v229
+ v_accvgpr_read_b32 v121, a7
+ v_mfma_f32_16x16x16_f16 a[0:3], v[8:9], v[222:223], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v18, a46
+ v_accvgpr_read_b32 v19, a47
+ v_accvgpr_read_b32 v25, a53
+ v_mfma_f32_16x16x16_f16 a[0:3], v[10:11], v[224:225], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v227, a26
+ v_accvgpr_read_b32 v228, a27
+ v_accvgpr_read_b32 v26, a54
+ v_mfma_f32_16x16x16_f16 a[16:19], v[12:13], v[222:223], a[84:87]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v27, a55
+ v_mfma_f32_16x16x16_f16 a[16:19], v[14:15], v[224:225], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ s_nop 0
+ v_accvgpr_read_b32 v229, a0
+ v_accvgpr_read_b32 v233, a1
+ v_accvgpr_read_b32 v234, a2
+ v_mfma_f32_16x16x16_f16 a[28:31], v[4:5], v[218:219], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v235, a3
+ v_mfma_f32_16x16x16_f16 a[28:31], v[6:7], v[220:221], a[28:31]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v239, a16
+ v_accvgpr_read_b32 v240, a17
+ v_accvgpr_read_b32 v241, a18
+ v_mfma_f32_16x16x16_f16 a[32:35], v[0:1], v[218:219], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v245, a19
+ v_accvgpr_read_b32 v219, a20
+ v_mfma_f32_16x16x16_f16 a[32:35], v[2:3], v[220:221], a[32:35]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v220, a21
+ v_accvgpr_read_b32 v221, a22
+ v_accvgpr_read_b32 v226, a29
+ v_mfma_f32_16x16x16_f16 a[12:15], v[4:5], v[222:223], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v230, a30
+ v_accvgpr_read_b32 v231, a31
+ v_mfma_f32_16x16x16_f16 a[4:7], v[6:7], v[224:225], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v232, a32
+ v_accvgpr_read_b32 v236, a33
+ v_accvgpr_read_b32 v237, a34
+ v_mfma_f32_16x16x16_f16 a[8:11], v[0:1], v[222:223], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v222, a23
+ v_accvgpr_read_b32 v223, a24
+ v_accvgpr_read_b32 v238, a35
+ v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[224:225], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v242, a4
+ v_accvgpr_read_b32 v243, a5
+ v_accvgpr_read_b32 v244, a6
+ v_mfma_f32_16x16x16_f16 a[12:15], v[8:9], v[252:253], a[240:243]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v246, a7
+ v_accvgpr_read_b32 v224, a25
+ v_accvgpr_read_b32 v225, a28
+ v_mfma_f32_16x16x16_f16 a[12:15], v[10:11], v[254:255], a[12:15]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v247, a8
+ v_accvgpr_read_b32 v248, a9
+ v_accvgpr_read_b32 v249, a10
+ v_mfma_f32_16x16x16_f16 a[0:3], v[12:13], v[252:253], a[196:199]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v250, a11
+ v_mfma_f32_16x16x16_f16 a[0:3], v[14:15], v[254:255], a[0:3]
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[16:19], v[8:9], a[40:41], a[192:195]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v8, a12
+ v_accvgpr_read_b32 v9, a13
+ v_cvt_f16_f32_e32 v8, v8
+ v_mfma_f32_16x16x16_f16 a[16:19], v[10:11], a[42:43], a[16:19]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v10, a14
+ v_accvgpr_read_b32 v11, a15
+ v_accvgpr_write_b32 a12, v102
+ v_mfma_f32_16x16x16_f16 a[20:23], v[12:13], a[40:41], a[220:223]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v12, a0
+ v_accvgpr_read_b32 v13, a1
+ v_accvgpr_write_b32 a13, v103
+ v_mfma_f32_16x16x16_f16 a[20:23], v[14:15], a[42:43], a[20:23]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v14, a2
+ v_accvgpr_read_b32 v15, a3
+ v_accvgpr_write_b32 a14, v104
+ v_mfma_f32_16x16x16_f16 a[4:7], v[4:5], v[252:253], a[200:203]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_write_b32 a15, v105
+ v_or_b32_e32 v102, 0x80, v108
+ v_mov_b32_e32 v103, v102
+ v_mfma_f32_16x16x16_f16 a[4:7], v[6:7], v[254:255], a[4:7]
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v9, v9
+ v_cvt_f16_f32_e32 v10, v10
+ v_cvt_f16_f32_e32 v11, v11
+ v_mfma_f32_16x16x16_f16 a[8:11], v[0:1], v[252:253], a[116:119]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v217, a16
+ v_accvgpr_read_b32 v218, a17
+ v_cvt_f16_f32_e32 v12, v12
+ v_mfma_f32_16x16x16_f16 a[8:11], v[2:3], v[254:255], a[8:11]
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v13, v13
+ v_cvt_f16_f32_e32 v14, v14
+ v_cvt_f16_f32_e32 v15, v15
+ v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], a[40:41], a[92:95]
+ ; sched_barrier mask(0x000007F6)
+ v_accvgpr_read_b32 v4, a18
+ v_accvgpr_read_b32 v5, a19
+ v_mfma_f32_16x16x16_f16 a[0:3], v[6:7], a[42:43], a[0:3]
+ v_cvt_f16_f32_e32 v6, v98
+ ; sched_barrier mask(0x000007F6)
+ v_cvt_f16_f32_e32 v98, v162
+ v_pack_b32_f16 v7, v99, v6
+ v_mfma_f32_16x16x16_f16 a[12:15], v[0:1], a[40:41], a[12:15]
+ v_pack_b32_f16 v6, v94, v95
+ v_mul_lo_u32 v94, s4, v130
+ v_add_lshl_u32 v1, v94, v108, 1
+ ; sched_barrier mask(0x000007F6)
+ v_mfma_f32_16x16x16_f16 a[12:15], v[2:3], a[42:43], a[12:15]
+ ; sched_barrier mask(0x00000406)
+ ; sched_barrier mask(0x00000406)
+ buffer_store_dwordx2 v[6:7], v1, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v1, v91
+ v_cvt_f16_f32_e32 v2, v92
+ v_cvt_f16_f32_e32 v3, v93
+ v_cvt_f16_f32_e32 v6, v88
+ v_pack_b32_f16 v93, v89, v90
+ v_pack_b32_f16 v92, v84, v82
+ v_cvt_f16_f32_e32 v84, v79
+ v_cvt_f16_f32_e32 v90, v83
+ s_lshl_b32 s4, s4, 5
+ v_add_u32_e32 v89, s4, v94
+ v_pack_b32_f16 v3, v2, v3
+ v_pack_b32_f16 v2, v1, v6
+ v_add_lshl_u32 v6, v94, v109, 1
+ v_add_lshl_u32 v82, v89, v108, 1
+ v_pack_b32_f16 v86, v84, v90
+ v_add_lshl_u32 v90, v89, v109, 1
+ buffer_store_dwordx2 v[2:3], v6, s[0:3], 0 offen
+ buffer_store_dwordx2 v[92:93], v82, s[0:3], 0 offen
+ buffer_store_dwordx2 v[86:87], v90, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v87, v80
+ v_cvt_f16_f32_e32 v90, v78
+ v_pack_b32_f16 v93, v81, v85
+ v_add_lshl_u32 v85, v94, v107, 1
+ v_cvt_f16_f32_e32 v95, v161
+ v_pack_b32_f16 v92, v87, v90
+ v_cvt_f16_f32_e32 v87, v74
+ v_cvt_f16_f32_e32 v90, v70
+ buffer_store_dwordx2 v[92:93], v85, s[0:3], 0 offen
+ v_pack_b32_f16 v93, v76, v77
+ v_add_lshl_u32 v77, v94, v106, 1
+ v_pack_b32_f16 v92, v87, v90
+ v_cvt_f16_f32_e32 v87, v57
+ v_pack_b32_f16 v57, v64, v65
+ v_cvt_f16_f32_e32 v64, v71
+ v_cvt_f16_f32_e32 v71, v72
+ v_cvt_f16_f32_e32 v65, v73
+ v_cvt_f16_f32_e32 v72, v75
+ v_pack_b32_f16 v56, v56, v87
+ v_add_lshl_u32 v73, v89, v107, 1
+ v_pack_b32_f16 v64, v64, v71
+ v_pack_b32_f16 v65, v65, v72
+ v_add_lshl_u32 v71, v89, v106, 1
+ buffer_store_dwordx2 v[92:93], v77, s[0:3], 0 offen
+ buffer_store_dwordx2 v[56:57], v73, s[0:3], 0 offen
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v64, v128
+ v_cvt_f16_f32_e32 v71, v133
+ v_cvt_f16_f32_e32 v65, v134
+ v_cvt_f16_f32_e32 v72, v135
+ v_cvt_f16_f32_e32 v73, v149
+ v_cvt_f16_f32_e32 v75, v150
+ v_cvt_f16_f32_e32 v93, v151
+ v_cvt_f16_f32_e32 v99, v163
+ v_cvt_f16_f32_e32 v128, v180
+ v_cvt_f16_f32_e32 v130, v181
+ v_cvt_f16_f32_e32 v133, v182
+ v_cvt_f16_f32_e32 v134, v191
+ v_cvt_f16_f32_e32 v135, v192
+ v_cvt_f16_f32_e32 v149, v193
+ v_pack_b32_f16 v65, v65, v72
+ v_pack_b32_f16 v64, v64, v71
+ v_add_lshl_u32 v71, v94, v102, 1
+ v_or_b32_e32 v72, 0xa0, v108
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v93, v95
+ v_pack_b32_f16 v64, v73, v75
+ v_add_lshl_u32 v71, v94, v72, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v128, v130
+ v_pack_b32_f16 v64, v98, v99
+ v_add_lshl_u32 v71, v89, v102, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v135, v149
+ v_pack_b32_f16 v64, v133, v134
+ v_add_lshl_u32 v71, v89, v72, 1
+ v_mov_b32_e32 v102, v72
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v65, v157
+ v_cvt_f16_f32_e32 v72, v168
+ v_cvt_f16_f32_e32 v64, v155
+ v_cvt_f16_f32_e32 v71, v156
+ v_cvt_f16_f32_e32 v73, v169
+ v_cvt_f16_f32_e32 v75, v170
+ v_cvt_f16_f32_e32 v93, v186
+ v_cvt_f16_f32_e32 v95, v187
+ v_cvt_f16_f32_e32 v98, v188
+ v_cvt_f16_f32_e32 v99, v197
+ v_cvt_f16_f32_e32 v128, v198
+ v_cvt_f16_f32_e32 v130, v199
+ v_cvt_f16_f32_e32 v133, v209
+ v_cvt_f16_f32_e32 v134, v210
+ v_cvt_f16_f32_e32 v135, v211
+ v_cvt_f16_f32_e32 v149, v212
+ v_pack_b32_f16 v65, v65, v72
+ v_or_b32_e32 v72, 0xc0, v108
+ v_pack_b32_f16 v64, v64, v71
+ v_add_lshl_u32 v71, v94, v72, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v93, v95
+ v_pack_b32_f16 v64, v73, v75
+ v_add_lshl_u32 v71, v94, v251, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v128, v130
+ v_pack_b32_f16 v64, v98, v99
+ v_add_lshl_u32 v71, v89, v72, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_pack_b32_f16 v65, v135, v149
+ v_pack_b32_f16 v64, v133, v134
+ v_add_lshl_u32 v71, v89, v251, 1
+ buffer_store_dwordx2 v[64:65], v71, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v64, v33
+ v_cvt_f16_f32_e32 v33, v34
+ v_cvt_f16_f32_e32 v34, v35
+ v_cvt_f16_f32_e32 v35, v36
+ v_cvt_f16_f32_e32 v36, v38
+ v_cvt_f16_f32_e32 v38, v39
+ v_cvt_f16_f32_e32 v39, v40
+ v_cvt_f16_f32_e32 v40, v44
+ v_cvt_f16_f32_e32 v44, v45
+ v_cvt_f16_f32_e32 v45, v46
+ v_cvt_f16_f32_e32 v46, v49
+ v_add_u32_e32 v77, s4, v89
+ v_cvt_f16_f32_e32 v49, v50
+ v_cvt_f16_f32_e32 v50, v51
+ v_cvt_f16_f32_e32 v51, v52
+ v_cvt_f16_f32_e32 v52, v53
+ v_pack_b32_f16 v33, v33, v34
+ v_pack_b32_f16 v32, v32, v64
+ v_add_lshl_u32 v34, v77, v108, 1
+ v_add_u32_e32 v90, s4, v77
+ ; sched_barrier mask(0x00000406)
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v35, v36
+ v_add_lshl_u32 v34, v77, v109, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v45, v46
+ v_pack_b32_f16 v32, v40, v44
+ v_add_lshl_u32 v34, v90, v108, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v51, v52
+ v_pack_b32_f16 v32, v49, v50
+ v_add_lshl_u32 v34, v90, v109, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v37
+ v_cvt_f16_f32_e32 v34, v41
+ v_cvt_f16_f32_e32 v33, v42
+ v_cvt_f16_f32_e32 v35, v43
+ v_cvt_f16_f32_e32 v36, v47
+ v_cvt_f16_f32_e32 v37, v48
+ v_cvt_f16_f32_e32 v38, v54
+ v_cvt_f16_f32_e32 v39, v55
+ v_cvt_f16_f32_e32 v40, v58
+ v_cvt_f16_f32_e32 v41, v59
+ v_cvt_f16_f32_e32 v42, v60
+ v_cvt_f16_f32_e32 v43, v61
+ v_cvt_f16_f32_e32 v44, v62
+ v_cvt_f16_f32_e32 v45, v63
+ v_cvt_f16_f32_e32 v46, v68
+ v_cvt_f16_f32_e32 v47, v69
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v77, v107, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v77, v106, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v90, v107, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v90, v106, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v96
+ v_cvt_f16_f32_e32 v34, v97
+ v_cvt_f16_f32_e32 v33, v100
+ v_cvt_f16_f32_e32 v35, v101
+ v_cvt_f16_f32_e32 v36, v67
+ v_cvt_f16_f32_e32 v37, v110
+ v_cvt_f16_f32_e32 v38, v140
+ v_cvt_f16_f32_e32 v39, v141
+ v_cvt_f16_f32_e32 v40, v142
+ v_cvt_f16_f32_e32 v41, v143
+ v_cvt_f16_f32_e32 v42, v144
+ v_cvt_f16_f32_e32 v43, v145
+ v_cvt_f16_f32_e32 v44, v152
+ v_cvt_f16_f32_e32 v45, v153
+ v_cvt_f16_f32_e32 v46, v154
+ v_cvt_f16_f32_e32 v47, v164
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v77, v103, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v77, v102, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v90, v103, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v90, v102, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v123
+ v_cvt_f16_f32_e32 v34, v124
+ v_cvt_f16_f32_e32 v33, v129
+ v_cvt_f16_f32_e32 v35, v131
+ v_cvt_f16_f32_e32 v36, v132
+ v_cvt_f16_f32_e32 v37, v137
+ v_cvt_f16_f32_e32 v38, v138
+ v_cvt_f16_f32_e32 v39, v139
+ v_cvt_f16_f32_e32 v40, v158
+ v_cvt_f16_f32_e32 v41, v159
+ v_cvt_f16_f32_e32 v42, v160
+ v_cvt_f16_f32_e32 v43, v171
+ v_cvt_f16_f32_e32 v44, v172
+ v_cvt_f16_f32_e32 v45, v173
+ v_cvt_f16_f32_e32 v46, v189
+ v_cvt_f16_f32_e32 v47, v190
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v77, v72, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v77, v251, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v90, v72, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v90, v251, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v136
+ v_cvt_f16_f32_e32 v34, v146
+ v_cvt_f16_f32_e32 v33, v147
+ v_cvt_f16_f32_e32 v35, v148
+ v_cvt_f16_f32_e32 v36, v165
+ v_cvt_f16_f32_e32 v37, v166
+ v_cvt_f16_f32_e32 v38, v167
+ v_cvt_f16_f32_e32 v39, v177
+ v_cvt_f16_f32_e32 v40, v178
+ v_cvt_f16_f32_e32 v41, v179
+ v_cvt_f16_f32_e32 v42, v194
+ v_cvt_f16_f32_e32 v43, v195
+ v_add_u32_e32 v92, s4, v90
+ v_cvt_f16_f32_e32 v44, v196
+ v_cvt_f16_f32_e32 v45, v203
+ v_cvt_f16_f32_e32 v46, v204
+ v_cvt_f16_f32_e32 v47, v205
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v92, v108, 1
+ v_add_u32_e32 v87, s4, v92
+ ; sched_barrier mask(0x00000406)
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v92, v109, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v87, v108, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v87, v109, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v174
+ v_cvt_f16_f32_e32 v34, v175
+ v_cvt_f16_f32_e32 v33, v176
+ v_cvt_f16_f32_e32 v35, v183
+ v_cvt_f16_f32_e32 v36, v184
+ v_cvt_f16_f32_e32 v37, v185
+ v_cvt_f16_f32_e32 v38, v200
+ v_cvt_f16_f32_e32 v39, v201
+ v_cvt_f16_f32_e32 v40, v202
+ v_cvt_f16_f32_e32 v41, v206
+ v_cvt_f16_f32_e32 v42, v207
+ v_cvt_f16_f32_e32 v43, v208
+ v_cvt_f16_f32_e32 v44, v213
+ v_cvt_f16_f32_e32 v45, v214
+ v_cvt_f16_f32_e32 v46, v215
+ v_cvt_f16_f32_e32 v47, v216
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v92, v107, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v92, v106, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v87, v107, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v87, v106, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v219
+ v_cvt_f16_f32_e32 v34, v220
+ v_cvt_f16_f32_e32 v33, v221
+ v_cvt_f16_f32_e32 v35, v222
+ v_cvt_f16_f32_e32 v36, v223
+ v_cvt_f16_f32_e32 v37, v224
+ v_cvt_f16_f32_e32 v38, v227
+ v_cvt_f16_f32_e32 v39, v228
+ v_cvt_f16_f32_e32 v40, v229
+ v_cvt_f16_f32_e32 v41, v233
+ v_cvt_f16_f32_e32 v42, v234
+ v_cvt_f16_f32_e32 v43, v235
+ v_cvt_f16_f32_e32 v44, v239
+ v_cvt_f16_f32_e32 v45, v240
+ v_cvt_f16_f32_e32 v46, v241
+ v_cvt_f16_f32_e32 v47, v245
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v92, v103, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v92, v102, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v87, v103, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v87, v102, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v225
+ v_cvt_f16_f32_e32 v34, v226
+ v_cvt_f16_f32_e32 v33, v230
+ v_cvt_f16_f32_e32 v35, v231
+ v_cvt_f16_f32_e32 v36, v232
+ v_cvt_f16_f32_e32 v37, v236
+ v_cvt_f16_f32_e32 v38, v237
+ v_cvt_f16_f32_e32 v39, v238
+ v_cvt_f16_f32_e32 v40, v242
+ v_cvt_f16_f32_e32 v41, v243
+ v_cvt_f16_f32_e32 v42, v244
+ v_cvt_f16_f32_e32 v43, v246
+ v_cvt_f16_f32_e32 v44, v247
+ v_cvt_f16_f32_e32 v45, v248
+ v_cvt_f16_f32_e32 v46, v249
+ v_cvt_f16_f32_e32 v47, v250
+ v_pack_b32_f16 v33, v33, v35
+ v_pack_b32_f16 v32, v32, v34
+ v_add_lshl_u32 v34, v92, v72, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v38, v39
+ v_pack_b32_f16 v32, v36, v37
+ v_add_lshl_u32 v34, v92, v251, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v42, v43
+ v_pack_b32_f16 v32, v40, v41
+ v_add_lshl_u32 v34, v87, v72, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_pack_b32_f16 v33, v46, v47
+ v_pack_b32_f16 v32, v44, v45
+ v_add_lshl_u32 v34, v87, v251, 1
+ buffer_store_dwordx2 v[32:33], v34, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v32, v17
+ v_cvt_f16_f32_e32 v17, v18
+ v_cvt_f16_f32_e32 v18, v19
+ v_cvt_f16_f32_e32 v19, v20
+ v_cvt_f16_f32_e32 v20, v22
+ v_cvt_f16_f32_e32 v22, v23
+ v_cvt_f16_f32_e32 v23, v24
+ v_cvt_f16_f32_e32 v24, v28
+ v_cvt_f16_f32_e32 v28, v29
+ v_cvt_f16_f32_e32 v29, v30
+ v_cvt_f16_f32_e32 v30, v112
+ v_add_u32_e32 v57, s4, v87
+ v_cvt_f16_f32_e32 v33, v113
+ v_cvt_f16_f32_e32 v34, v114
+ v_cvt_f16_f32_e32 v35, v118
+ v_cvt_f16_f32_e32 v36, v119
+ v_pack_b32_f16 v17, v17, v18
+ v_pack_b32_f16 v16, v16, v32
+ v_add_lshl_u32 v18, v57, v108, 1
+ v_add_u32_e32 v56, s4, v57
+ ; sched_barrier mask(0x00000406)
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v22, v23
+ v_pack_b32_f16 v16, v19, v20
+ v_add_lshl_u32 v18, v57, v109, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v29, v30
+ v_pack_b32_f16 v16, v24, v28
+ v_add_lshl_u32 v18, v56, v108, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v35, v36
+ v_pack_b32_f16 v16, v33, v34
+ v_add_lshl_u32 v18, v56, v109, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v16, v21
+ v_cvt_f16_f32_e32 v18, v25
+ v_cvt_f16_f32_e32 v17, v26
+ v_cvt_f16_f32_e32 v19, v27
+ v_cvt_f16_f32_e32 v20, v31
+ v_cvt_f16_f32_e32 v21, v66
+ v_cvt_f16_f32_e32 v22, v111
+ v_cvt_f16_f32_e32 v23, v115
+ v_cvt_f16_f32_e32 v24, v116
+ v_cvt_f16_f32_e32 v25, v117
+ v_cvt_f16_f32_e32 v26, v120
+ v_cvt_f16_f32_e32 v27, v121
+ v_cvt_f16_f32_e32 v28, v122
+ v_cvt_f16_f32_e32 v29, v125
+ v_cvt_f16_f32_e32 v30, v126
+ v_cvt_f16_f32_e32 v31, v127
+ v_pack_b32_f16 v17, v17, v19
+ v_pack_b32_f16 v16, v16, v18
+ v_add_lshl_u32 v18, v57, v107, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v22, v23
+ v_pack_b32_f16 v16, v20, v21
+ v_add_lshl_u32 v18, v57, v106, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v26, v27
+ v_pack_b32_f16 v16, v24, v25
+ v_add_lshl_u32 v18, v56, v107, 1
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_pack_b32_f16 v17, v30, v31
+ v_pack_b32_f16 v16, v28, v29
+ v_add_lshl_u32 v18, v56, v106, 1
+ v_accvgpr_read_b32 v0, a20
+ v_accvgpr_read_b32 v7, a21
+ v_accvgpr_read_b32 v88, a22
+ v_accvgpr_read_b32 v91, a23
+ buffer_store_dwordx2 v[16:17], v18, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v16, v217
+ v_cvt_f16_f32_e32 v17, v218
+ v_cvt_f16_f32_e32 v18, v4
+ v_cvt_f16_f32_e32 v19, v5
+ v_cvt_f16_f32_e32 v0, v0
+ v_cvt_f16_f32_e32 v7, v7
+ v_cvt_f16_f32_e32 v20, v88
+ v_cvt_f16_f32_e32 v21, v91
+ v_pack_b32_f16 v5, v10, v11
+ v_pack_b32_f16 v4, v8, v9
+ v_add_lshl_u32 v8, v57, v103, 1
+ buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen
+ v_pack_b32_f16 v5, v14, v15
+ v_pack_b32_f16 v4, v12, v13
+ v_add_lshl_u32 v8, v57, v102, 1
+ buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen
+ v_pack_b32_f16 v5, v18, v19
+ v_pack_b32_f16 v4, v16, v17
+ v_add_lshl_u32 v8, v56, v103, 1
+ v_accvgpr_read_b32 v1, a4
+ v_accvgpr_read_b32 v2, a5
+ v_accvgpr_read_b32 v3, a6
+ v_accvgpr_read_b32 v6, a7
+ buffer_store_dwordx2 v[4:5], v8, s[0:3], 0 offen
+ v_pack_b32_f16 v5, v20, v21
+ v_pack_b32_f16 v4, v0, v7
+ v_add_lshl_u32 v0, v56, v102, 1
+ v_accvgpr_read_b32 v82, a8
+ v_accvgpr_read_b32 v79, a9
+ v_accvgpr_read_b32 v83, a10
+ v_accvgpr_read_b32 v84, a11
+ buffer_store_dwordx2 v[4:5], v0, s[0:3], 0 offen
+ v_cvt_f16_f32_e32 v0, v1
+ v_cvt_f16_f32_e32 v2, v2
+ v_cvt_f16_f32_e32 v1, v3
+ v_cvt_f16_f32_e32 v3, v6
+ v_accvgpr_read_b32 v86, a0
+ v_accvgpr_read_b32 v78, a1
+ v_accvgpr_read_b32 v80, a2
+ v_accvgpr_read_b32 v81, a3
+ v_cvt_f16_f32_e32 v4, v82
+ v_cvt_f16_f32_e32 v5, v79
+ v_cvt_f16_f32_e32 v6, v83
+ v_cvt_f16_f32_e32 v7, v84
+ v_accvgpr_read_b32 v85, a12
+ v_accvgpr_read_b32 v70, a13
+ v_accvgpr_read_b32 v74, a14
+ v_accvgpr_read_b32 v76, a15
+ v_cvt_f16_f32_e32 v8, v86
+ v_cvt_f16_f32_e32 v9, v78
+ v_cvt_f16_f32_e32 v10, v80
+ v_cvt_f16_f32_e32 v11, v81
+ v_cvt_f16_f32_e32 v12, v85
+ v_cvt_f16_f32_e32 v13, v70
+ v_cvt_f16_f32_e32 v14, v74
+ v_cvt_f16_f32_e32 v15, v76
+ v_pack_b32_f16 v1, v1, v3
+ v_pack_b32_f16 v0, v0, v2
+ v_add_lshl_u32 v2, v57, v72, 1
+ buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+ v_pack_b32_f16 v1, v6, v7
+ v_pack_b32_f16 v0, v4, v5
+ v_add_lshl_u32 v2, v57, v251, 1
+ buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+ v_pack_b32_f16 v1, v10, v11
+ v_pack_b32_f16 v0, v8, v9
+ v_add_lshl_u32 v2, v56, v72, 1
+ buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+ v_pack_b32_f16 v1, v14, v15
+ v_pack_b32_f16 v0, v12, v13
+ v_add_lshl_u32 v2, v56, v251, 1
+ ; sched_barrier mask(0x000007F6)
+ buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+ s_endpgm
+ .section .rodata,"a",@progbits
+ .p2align 6, 0x0
+ .amdhsa_kernel matmul_kernel
+ .amdhsa_group_segment_fixed_size 0
+ .amdhsa_private_segment_fixed_size 0
+ .amdhsa_kernarg_size 72
+ .amdhsa_user_sgpr_count 15
+ .amdhsa_user_sgpr_dispatch_ptr 0
+ .amdhsa_user_sgpr_queue_ptr 0
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 0
+ .amdhsa_user_sgpr_kernarg_preload_length 13
+ .amdhsa_user_sgpr_kernarg_preload_offset 0
+ .amdhsa_user_sgpr_private_segment_size 0
+ .amdhsa_enable_private_segment 0
+ .amdhsa_system_sgpr_workgroup_id_x 1
+ .amdhsa_system_sgpr_workgroup_id_y 0
+ .amdhsa_system_sgpr_workgroup_id_z 0
+ .amdhsa_system_sgpr_workgroup_info 0
+ .amdhsa_system_vgpr_workitem_id 0
+ .amdhsa_next_free_vgpr 512
+ .amdhsa_next_free_sgpr 30
+ .amdhsa_accum_offset 256
+ .amdhsa_reserve_vcc 1
+ .amdhsa_reserve_xnack_mask 1
+ .amdhsa_float_round_mode_32 0
+ .amdhsa_float_round_mode_16_64 0
+ .amdhsa_float_denorm_mode_32 3
+ .amdhsa_float_denorm_mode_16_64 3
+ .amdhsa_dx10_clamp 1
+ .amdhsa_ieee_mode 1
+ .amdhsa_fp16_overflow 0
+ .amdhsa_tg_split 0
+ .amdhsa_exception_fp_ieee_invalid_op 0
+ .amdhsa_exception_fp_denorm_src 0
+ .amdhsa_exception_fp_ieee_div_zero 0
+ .amdhsa_exception_fp_ieee_overflow 0
+ .amdhsa_exception_fp_ieee_underflow 0
+ .amdhsa_exception_fp_ieee_inexact 0
+ .amdhsa_exception_int_div_zero 0
+ .end_amdhsa_kernel
+ .text
+.Lfunc_end0:
+ .size matmul_kernel, .Lfunc_end0-matmul_kernel
+ .cfi_endproc
+ ; -- End function
+ .set matmul_kernel.num_vgpr, 256
+ .set matmul_kernel.num_agpr, 256
+ .set matmul_kernel.numbered_sgpr, 30
+ .set matmul_kernel.private_seg_size, 0
+ .set matmul_kernel.uses_vcc, 1
+ .set matmul_kernel.uses_flat_scratch, 0
+ .set matmul_kernel.has_dyn_sized_stack, 0
+ .set matmul_kernel.has_recursion, 0
+ .set matmul_kernel.has_indirect_call, 0
+ .section .AMDGPU.csdata,"",@progbits
+; Kernel info:
+; codeLenInByte = 22432
+; TotalNumSgprs: 36
+; NumVgprs: 256
+; NumAgprs: 256
+; TotalNumVgprs: 512
+; ScratchSize: 0
+; MemoryBound: 0
+; FloatMode: 240
+; IeeeMode: 1
+; LDSByteSize: 0 bytes/workgroup (compile time only)
+; SGPRBlocks: 4
+; VGPRBlocks: 63
+; NumSGPRsForWavesPerEU: 36
+; NumVGPRsForWavesPerEU: 512
+; AccumOffset: 256
+; Occupancy: 1
+; WaveLimiterHint : 0
+; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; COMPUTE_PGM_RSRC2:USER_SGPR: 15
+; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
+; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
+ .text
+ .p2alignl 6, 3212836864
+ .fill 256, 4, 3212836864
+ .section .AMDGPU.gpr_maximums,"",@progbits
+ .set amdgpu.max_num_vgpr, 0
+ .set amdgpu.max_num_agpr, 0
+ .set amdgpu.max_num_sgpr, 0
+ .text
+ .section .debug_abbrev,"",@progbits
+ .byte 1 ; Abbreviation Code
+ .byte 17 ; DW_TAG_compile_unit
+ .byte 0 ; DW_CHILDREN_no
+ .byte 37 ; DW_AT_producer
+ .byte 14 ; DW_FORM_strp
+ .byte 19 ; DW_AT_language
+ .byte 5 ; DW_FORM_data2
+ .byte 3 ; DW_AT_name
+ .byte 14 ; DW_FORM_strp
+ .byte 16 ; DW_AT_stmt_list
+ .byte 23 ; DW_FORM_sec_offset
+ .byte 17 ; DW_AT_low_pc
+ .byte 1 ; DW_FORM_addr
+ .byte 18 ; DW_AT_high_pc
+ .byte 6 ; DW_FORM_data4
+ .byte 0 ; EOM(1)
+ .byte 0 ; EOM(2)
+ .byte 0 ; EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 ; Length of Unit
+.Ldebug_info_start0:
+ .short 4 ; DWARF version number
+ .long .debug_abbrev ; Offset Into Abbrev. Section
+ .byte 8 ; Address Size (in bytes)
+ .byte 1 ; Abbrev [1] 0xb:0x1b DW_TAG_compile_unit
+ .long .Linfo_string0 ; DW_AT_producer
+ .short 2 ; DW_AT_language
+ .long .Linfo_string1 ; DW_AT_name
+ .long .Lline_table_start0 ; DW_AT_stmt_list
+ .quad .Lfunc_begin0 ; DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc
+.Ldebug_info_end0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "triton" ; string offset=0
+.Linfo_string1:
+ .asciz "<unknown>" ; string offset=7
+ .section ".note.GNU-stack","",@progbits
+ .amdgpu_metadata
+---
+amdhsa.kernels:
+ - .agpr_count: 256
+ .args:
+ - .address_space: global
+ .offset: 0
+ .size: 8
+ .value_kind: global_buffer
+ - .address_space: global
+ .offset: 8
+ .size: 8
+ .value_kind: global_buffer
+ - .address_space: global
+ .offset: 16
+ .size: 8
+ .value_kind: global_buffer
+ - .address_space: global
+ .offset: 24
+ .size: 8
+ .value_kind: global_buffer
+ - .offset: 32
+ .size: 4
+ .value_kind: by_value
+ - .offset: 36
+ .size: 4
+ .value_kind: by_value
+ - .offset: 40
+ .size: 4
+ .value_kind: by_value
+ - .offset: 44
+ .size: 4
+ .value_kind: by_value
+ - .offset: 48
+ .size: 4
+ .value_kind: by_value
+ - .offset: 52
+ .size: 4
+ .value_kind: by_value
+ - .offset: 56
+ .size: 4
+ .value_kind: by_value
+ - .address_space: global
+ .offset: 64
+ .size: 8
+ .value_kind: global_buffer
+ .group_segment_fixed_size: 0
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 72
+ .max_flat_workgroup_size: 256
+ .name: matmul_kernel
+ .private_segment_fixed_size: 0
+ .sgpr_count: 36
+ .sgpr_spill_count: 0
+ .symbol: matmul_kernel.kd
+ .vgpr_count: 512
+ .vgpr_spill_count: 0
+ .wavefront_size: 64
+amdhsa.target: amdgcn-amd-amdhsa--gfx942
+amdhsa.version:
+ - 1
+ - 1
+...
+
+ .end_amdgpu_metadata
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir
new file mode 100644
index 0000000000000..fb167d35c1c61
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast.llir
@@ -0,0 +1,5722 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nofree norecurse nounwind
+define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg nocapture readonly %0, ptr addrspace(1) inreg nocapture readonly %1, ptr addrspace(1) inreg nocapture writeonly %2, ptr addrspace(1) inreg nocapture readnone %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, ptr addrspace(1) inreg nocapture readnone %11) local_unnamed_addr #0 !dbg !4 {
+ %13 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %14 = sdiv i32 %13, 8
+ %15 = mul i32 %13, 76
+ %16 = mul i32 %14, -607
+ %17 = add i32 %16, %15
+ %18 = add i32 %5, 255
+ %19 = sdiv i32 %18, 256
+ %20 = shl nsw i32 %19, 2
+ %.frozen = freeze i32 %20
+ %21 = sdiv i32 %17, %.frozen
+ %22 = shl nsw i32 %21, 2
+ %23 = mul i32 %21, %.frozen
+ %.decomposed = sub i32 %17, %23
+ %24 = add i32 %4, 255
+ %25 = sdiv i32 %24, 256
+ %26 = sub nsw i32 %25, %22
+ %27 = tail call i32 @llvm.smin.i32(i32 %26, i32 4)
+ %.decomposed.frozen = freeze i32 %.decomposed
+ %.frozen2426 = freeze i32 %27
+ %28 = sdiv i32 %.decomposed.frozen, %.frozen2426
+ %29 = mul i32 %28, %.frozen2426
+ %.decomposed2427 = sub i32 %.decomposed.frozen, %29
+ %30 = add nsw i32 %.decomposed2427, %22
+ %31 = shl i32 %30, 8
+ %32 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %33 = lshr i32 %32, 3
+ %34 = and i32 %33, 16
+ %35 = and i32 %33, 31
+ %36 = or disjoint i32 %35, 32
+ %37 = or disjoint i32 %35, 64
+ %38 = or disjoint i32 %35, 96
+ %39 = or disjoint i32 %35, 128
+ %40 = or disjoint i32 %35, 160
+ %41 = or disjoint i32 %35, 192
+ %42 = or disjoint i32 %35, 224
+ %43 = mul i32 %31, %7
+ %44 = mul i32 %7, %35
+ %45 = mul i32 %7, %36
+ %46 = mul i32 %7, %37
+ %47 = mul i32 %7, %38
+ %48 = mul i32 %7, %39
+ %49 = mul i32 %7, %40
+ %50 = mul i32 %7, %41
+ %51 = mul i32 %7, %42
+ %52 = sext i32 %43 to i64
+ %53 = getelementptr half, ptr addrspace(1) %0, i64 %52
+ %54 = shl i32 %32, 3
+ %55 = and i32 %54, 56
+ %56 = add i32 %44, %55
+ %57 = add i32 %45, %55
+ %58 = add i32 %46, %55
+ %59 = add i32 %47, %55
+ %60 = add i32 %48, %55
+ %61 = add i32 %49, %55
+ %62 = add i32 %50, %55
+ %63 = add i32 %51, %55
+ %64 = getelementptr i8, ptr addrspace(1) %53, i64 128
+ %65 = add i32 %6, 63
+ %66 = icmp sgt i32 %65, 63
+ %67 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %53, i16 0, i32 2147483646, i32 159744)
+ %68 = shl i32 %56, 1
+ %69 = select i1 %66, i32 %68, i32 -2147483648
+ %70 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %69, i32 0, i32 0)
+ %71 = shl i32 %57, 1
+ %72 = select i1 %66, i32 %71, i32 -2147483648
+ %73 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %72, i32 0, i32 0)
+ %74 = shl i32 %58, 1
+ %75 = select i1 %66, i32 %74, i32 -2147483648
+ %76 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %75, i32 0, i32 0)
+ %77 = shl i32 %59, 1
+ %78 = select i1 %66, i32 %77, i32 -2147483648
+ %79 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %78, i32 0, i32 0)
+ %80 = shl i32 %60, 1
+ %81 = select i1 %66, i32 %80, i32 -2147483648
+ %82 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %81, i32 0, i32 0)
+ %83 = shl i32 %61, 1
+ %84 = select i1 %66, i32 %83, i32 -2147483648
+ %85 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %84, i32 0, i32 0)
+ %86 = shl i32 %62, 1
+ %87 = select i1 %66, i32 %86, i32 -2147483648
+ %88 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %87, i32 0, i32 0)
+ %89 = shl i32 %63, 1
+ %90 = select i1 %66, i32 %89, i32 -2147483648
+ %91 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %90, i32 0, i32 0)
+ %92 = icmp sgt i32 %65, 127
+ %93 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %64, i16 0, i32 2147483646, i32 159744)
+ %94 = select i1 %92, i32 %68, i32 -2147483648
+ %95 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %94, i32 0, i32 0)
+ %96 = bitcast <4 x i32> %95 to <8 x half>
+ %97 = select i1 %92, i32 %71, i32 -2147483648
+ %98 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %97, i32 0, i32 0)
+ %99 = bitcast <4 x i32> %98 to <8 x half>
+ %100 = select i1 %92, i32 %74, i32 -2147483648
+ %101 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %100, i32 0, i32 0)
+ %102 = bitcast <4 x i32> %101 to <8 x half>
+ %103 = select i1 %92, i32 %77, i32 -2147483648
+ %104 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %103, i32 0, i32 0)
+ %105 = bitcast <4 x i32> %104 to <8 x half>
+ %106 = select i1 %92, i32 %80, i32 -2147483648
+ %107 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %106, i32 0, i32 0)
+ %108 = bitcast <4 x i32> %107 to <8 x half>
+ %109 = select i1 %92, i32 %83, i32 -2147483648
+ %110 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %109, i32 0, i32 0)
+ %111 = bitcast <4 x i32> %110 to <8 x half>
+ %112 = select i1 %92, i32 %86, i32 -2147483648
+ %113 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %112, i32 0, i32 0)
+ %114 = bitcast <4 x i32> %113 to <8 x half>
+ %115 = select i1 %92, i32 %89, i32 -2147483648
+ %116 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %115, i32 0, i32 0)
+ %117 = bitcast <4 x i32> %116 to <8 x half>
+ %118 = shl i32 %28, 8
+ %119 = mul i32 %118, %8
+ %120 = mul i32 %8, %35
+ %121 = mul i32 %8, %36
+ %122 = mul i32 %8, %37
+ %123 = mul i32 %8, %38
+ %124 = mul i32 %8, %39
+ %125 = mul i32 %8, %40
+ %126 = mul i32 %8, %41
+ %127 = mul i32 %8, %42
+ %128 = sext i32 %119 to i64
+ %129 = getelementptr half, ptr addrspace(1) %1, i64 %128
+ %130 = add i32 %120, %55
+ %131 = add i32 %121, %55
+ %132 = add i32 %122, %55
+ %133 = add i32 %123, %55
+ %134 = add i32 %124, %55
+ %135 = add i32 %125, %55
+ %136 = add i32 %126, %55
+ %137 = add i32 %127, %55
+ %138 = getelementptr i8, ptr addrspace(1) %129, i64 128
+ %139 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %129, i16 0, i32 2147483646, i32 159744)
+ %140 = shl i32 %130, 1
+ %141 = select i1 %66, i32 %140, i32 -2147483648
+ %142 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %141, i32 0, i32 0)
+ %143 = shl i32 %131, 1
+ %144 = select i1 %66, i32 %143, i32 -2147483648
+ %145 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %144, i32 0, i32 0)
+ %146 = shl i32 %132, 1
+ %147 = select i1 %66, i32 %146, i32 -2147483648
+ %148 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %147, i32 0, i32 0)
+ %149 = shl i32 %133, 1
+ %150 = select i1 %66, i32 %149, i32 -2147483648
+ %151 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %150, i32 0, i32 0)
+ %152 = shl i32 %134, 1
+ %153 = select i1 %66, i32 %152, i32 -2147483648
+ %154 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %153, i32 0, i32 0)
+ %155 = shl i32 %135, 1
+ %156 = select i1 %66, i32 %155, i32 -2147483648
+ %157 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %156, i32 0, i32 0)
+ %158 = shl i32 %136, 1
+ %159 = select i1 %66, i32 %158, i32 -2147483648
+ %160 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %159, i32 0, i32 0)
+ %161 = shl i32 %137, 1
+ %162 = select i1 %66, i32 %161, i32 -2147483648
+ %163 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %162, i32 0, i32 0)
+ %164 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %138, i16 0, i32 2147483646, i32 159744)
+ %165 = select i1 %92, i32 %140, i32 -2147483648
+ %166 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %165, i32 0, i32 0)
+ %167 = bitcast <4 x i32> %166 to <8 x half>
+ %168 = select i1 %92, i32 %143, i32 -2147483648
+ %169 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %168, i32 0, i32 0)
+ %170 = bitcast <4 x i32> %169 to <8 x half>
+ %171 = select i1 %92, i32 %146, i32 -2147483648
+ %172 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %171, i32 0, i32 0)
+ %173 = bitcast <4 x i32> %172 to <8 x half>
+ %174 = select i1 %92, i32 %149, i32 -2147483648
+ %175 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %174, i32 0, i32 0)
+ %176 = bitcast <4 x i32> %175 to <8 x half>
+ %177 = select i1 %92, i32 %152, i32 -2147483648
+ %178 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %177, i32 0, i32 0)
+ %179 = bitcast <4 x i32> %178 to <8 x half>
+ %180 = select i1 %92, i32 %155, i32 -2147483648
+ %181 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %180, i32 0, i32 0)
+ %182 = bitcast <4 x i32> %181 to <8 x half>
+ %183 = select i1 %92, i32 %158, i32 -2147483648
+ %184 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %183, i32 0, i32 0)
+ %185 = bitcast <4 x i32> %184 to <8 x half>
+ %186 = select i1 %92, i32 %161, i32 -2147483648
+ %187 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %186, i32 0, i32 0)
+ %188 = bitcast <4 x i32> %187 to <8 x half>
+ %189 = icmp sgt i32 %7, 0
+ tail call void @llvm.assume(i1 %189)
+ %190 = icmp sgt i32 %8, 0
+ tail call void @llvm.assume(i1 %190)
+ %191 = icmp sgt i32 %9, 0
+ tail call void @llvm.assume(i1 %191)
+ %192 = icmp sgt i32 %10, 0
+ tail call void @llvm.assume(i1 %192)
+ %193 = icmp sgt i32 %30, 0
+ tail call void @llvm.assume(i1 %193)
+ %194 = icmp sgt i32 %28, 0
+ tail call void @llvm.assume(i1 %194)
+ %195 = xor i32 %54, %32
+ %196 = and i32 %195, 56
+ %197 = shl nuw nsw i32 %35, 6
+ %198 = or disjoint i32 %197, %196
+ %199 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %198
+ store <4 x i32> %70, ptr addrspace(3) %199, align 16
+ %200 = or disjoint i32 %198, 2048
+ %201 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %200
+ store <4 x i32> %73, ptr addrspace(3) %201, align 16
+ %202 = or disjoint i32 %198, 4096
+ %203 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %202
+ store <4 x i32> %76, ptr addrspace(3) %203, align 16
+ %204 = or disjoint i32 %198, 6144
+ %205 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %204
+ store <4 x i32> %79, ptr addrspace(3) %205, align 16
+ %206 = or disjoint i32 %198, 8192
+ %207 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %206
+ store <4 x i32> %82, ptr addrspace(3) %207, align 16
+ %208 = or disjoint i32 %198, 10240
+ %209 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %208
+ store <4 x i32> %85, ptr addrspace(3) %209, align 16
+ %210 = or disjoint i32 %198, 12288
+ %211 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %210
+ store <4 x i32> %88, ptr addrspace(3) %211, align 16
+ %212 = or disjoint i32 %198, 14336
+ %213 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %212
+ store <4 x i32> %91, ptr addrspace(3) %213, align 16
+ %214 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %198
+ store <4 x i32> %142, ptr addrspace(3) %214, align 16
+ %215 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %200
+ store <4 x i32> %145, ptr addrspace(3) %215, align 16
+ %216 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %202
+ store <4 x i32> %148, ptr addrspace(3) %216, align 16
+ %217 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %204
+ store <4 x i32> %151, ptr addrspace(3) %217, align 16
+ %218 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %206
+ store <4 x i32> %154, ptr addrspace(3) %218, align 16
+ %219 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %208
+ store <4 x i32> %157, ptr addrspace(3) %219, align 16
+ %220 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %210
+ store <4 x i32> %160, ptr addrspace(3) %220, align 16
+ %221 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %212
+ store <4 x i32> %163, ptr addrspace(3) %221, align 16
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %222 = and i32 %32, 15
+ %223 = lshr i32 %32, 4
+ %224 = and i32 %223, 3
+ %225 = or disjoint i32 %34, %222
+ %226 = and i32 %32, 7
+ %227 = xor i32 %224, %226
+ %228 = shl nuw nsw i32 %227, 3
+ %229 = shl nuw nsw i32 %225, 6
+ %230 = or disjoint i32 %229, %228
+ %231 = or disjoint i32 %229, 2048
+ %232 = or disjoint i32 %231, %228
+ %233 = getelementptr half, ptr addrspace(3) @global_smem, i32 %230
+ %234 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %235 = getelementptr half, ptr addrspace(3) @global_smem, i32 %232
+ %236 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %237 = lshr i32 %32, 2
+ %238 = and i32 %237, 16
+ %239 = or disjoint i32 %238, %222
+ %240 = shl nuw nsw i32 %239, 6
+ %241 = or disjoint i32 %228, %240
+ %242 = or disjoint i32 %241, 2048
+ %243 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %241
+ %244 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %245 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %242
+ %246 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %247 = or disjoint i32 %229, 4096
+ %248 = or disjoint i32 %247, %228
+ %249 = or disjoint i32 %229, 6144
+ %250 = or disjoint i32 %249, %228
+ %251 = getelementptr half, ptr addrspace(3) @global_smem, i32 %248
+ %252 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %253 = getelementptr half, ptr addrspace(3) @global_smem, i32 %250
+ %254 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %255 = or disjoint i32 %240, 4096
+ %256 = or disjoint i32 %255, %228
+ %257 = or disjoint i32 %256, 2048
+ %258 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %256
+ %259 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %260 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %257
+ %261 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %262 = icmp sgt i32 %65, 191
+ br i1 %262, label %.lr.ph, label %.._crit_edge_crit_edge
+
+.._crit_edge_crit_edge: ; preds = %12
+ %.pre = or disjoint i32 %240, 8192
+ %.pre1013 = or disjoint i32 %.pre, %228
+ %.pre1015 = or disjoint i32 %240, 12288
+ %.pre1017 = or disjoint i32 %.pre1015, %228
+ %.pre1019 = or disjoint i32 %229, 8192
+ %.pre1021 = or disjoint i32 %.pre1019, %228
+ %.pre1023 = or disjoint i32 %229, 10240
+ %.pre1025 = or disjoint i32 %.pre1023, %228
+ %.pre1027 = or disjoint i32 %229, 12288
+ %.pre1029 = or disjoint i32 %.pre1027, %228
+ %.pre1031 = or disjoint i32 %229, 14336
+ %.pre1033 = or disjoint i32 %.pre1031, %228
+ %.pre1035 = or disjoint i32 %224, 4
+ %.pre1037 = xor i32 %.pre1035, %226
+ %.pre1039 = shl nuw nsw i32 %.pre1037, 3
+ %.pre1041 = or disjoint i32 %.pre1039, %240
+ %.pre1043 = or disjoint i32 %.pre1039, %229
+ %.pre1045 = or disjoint i32 %231, %.pre1039
+ %.pre1047 = or disjoint i32 %.pre1039, %255
+ %.pre1049 = or disjoint i32 %247, %.pre1039
+ %.pre1051 = or disjoint i32 %249, %.pre1039
+ %.pre1053 = or disjoint i32 %.pre1039, %.pre
+ %.pre1055 = or disjoint i32 %.pre1053, 2048
+ %.pre1057 = or disjoint i32 %.pre1039, %.pre1015
+ %.pre1059 = or disjoint i32 %.pre1057, 2048
+ %.pre1061 = or disjoint i32 %.pre1019, %.pre1039
+ %.pre1063 = or disjoint i32 %.pre1023, %.pre1039
+ %.pre1065 = or disjoint i32 %.pre1027, %.pre1039
+ %.pre1067 = or disjoint i32 %.pre1031, %.pre1039
+ %263 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %264 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %265 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %266 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %267 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %268 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %269 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %270 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %271 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %272 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %273 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %274 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %275 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %276 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %277 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %278 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %279 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %280 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %281 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %282 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %283 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %284 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %285 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %286 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %287 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %288 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %289 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %290 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %291 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %292 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %293 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %294 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %295 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %296 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %297 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %298 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %299 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %300 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %301 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %302 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %303 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %304 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %305 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %306 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %307 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %308 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %309 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %310 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %311 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %312 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %313 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %314 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %315 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %316 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %317 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %318 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %319 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %320 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %321 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %322 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %323 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %324 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %325 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %326 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %327 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %328 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %329 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %330 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %331 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %332 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %333 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %334 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %335 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %336 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %337 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %338 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %339 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %340 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %341 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %342 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %343 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %344 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %345 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %346 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %347 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %348 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %349 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %350 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %351 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %352 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %353 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %354 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %355 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %356 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %357 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %358 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ br label %._crit_edge
+
+.lr.ph: ; preds = %12
+ %359 = lshr i32 %65, 6
+ %invariant.op404 = or disjoint i32 %240, 6144
+ %invariant.op402 = or disjoint i32 %240, 2048
+ %invariant.op400 = or disjoint i32 %228, 2048
+ %360 = or disjoint i32 %240, 8192
+ %361 = or disjoint i32 %360, %228
+ %.reass = or disjoint i32 %360, %invariant.op400
+ %362 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %361
+ %363 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass
+ %364 = or disjoint i32 %240, 12288
+ %365 = or disjoint i32 %364, %228
+ %.reass401 = or disjoint i32 %364, %invariant.op400
+ %366 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %365
+ %367 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass401
+ %368 = or disjoint i32 %229, 8192
+ %369 = or disjoint i32 %368, %228
+ %370 = or disjoint i32 %229, 10240
+ %371 = or disjoint i32 %370, %228
+ %372 = getelementptr half, ptr addrspace(3) @global_smem, i32 %369
+ %373 = getelementptr half, ptr addrspace(3) @global_smem, i32 %371
+ %374 = or disjoint i32 %229, 12288
+ %375 = or disjoint i32 %374, %228
+ %376 = or disjoint i32 %229, 14336
+ %377 = or disjoint i32 %376, %228
+ %378 = getelementptr half, ptr addrspace(3) @global_smem, i32 %375
+ %379 = getelementptr half, ptr addrspace(3) @global_smem, i32 %377
+ %380 = or disjoint i32 %224, 4
+ %381 = xor i32 %380, %226
+ %382 = shl nuw nsw i32 %381, 3
+ %383 = or disjoint i32 %382, %229
+ %384 = or disjoint i32 %231, %382
+ %385 = getelementptr half, ptr addrspace(3) @global_smem, i32 %383
+ %386 = getelementptr half, ptr addrspace(3) @global_smem, i32 %384
+ %387 = or disjoint i32 %382, %240
+ %.reass403 = or disjoint i32 %382, %invariant.op402
+ %388 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %387
+ %389 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass403
+ %390 = or disjoint i32 %382, %255
+ %.reass405 = or disjoint i32 %382, %invariant.op404
+ %391 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %390
+ %392 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass405
+ %393 = or disjoint i32 %247, %382
+ %394 = or disjoint i32 %249, %382
+ %395 = getelementptr half, ptr addrspace(3) @global_smem, i32 %393
+ %396 = getelementptr half, ptr addrspace(3) @global_smem, i32 %394
+ %397 = or disjoint i32 %382, %360
+ %398 = or disjoint i32 %397, 2048
+ %399 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %397
+ %400 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %398
+ %401 = or disjoint i32 %382, %364
+ %402 = or disjoint i32 %401, 2048
+ %403 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %401
+ %404 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %402
+ %405 = or disjoint i32 %368, %382
+ %406 = or disjoint i32 %370, %382
+ %407 = getelementptr half, ptr addrspace(3) @global_smem, i32 %405
+ %408 = getelementptr half, ptr addrspace(3) @global_smem, i32 %406
+ %409 = or disjoint i32 %374, %382
+ %410 = or disjoint i32 %376, %382
+ %411 = getelementptr half, ptr addrspace(3) @global_smem, i32 %409
+ %412 = getelementptr half, ptr addrspace(3) @global_smem, i32 %410
+ %413 = add nsw i32 %359, -3
+ %414 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %415 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %416 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %417 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %418 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %419 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %420 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %421 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %422 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %423 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %424 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %425 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %426 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %427 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %428 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %429 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %430 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %431 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %432 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %433 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %434 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %435 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %436 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %437 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %438 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %439 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %440 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %441 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %442 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %443 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %444 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %445 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %446 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %447 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %448 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %449 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %450 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %451 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %452 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %453 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %454 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %455 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %456 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %457 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %458 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %459 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %460 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %461 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %462 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %463 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %464 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %465 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %466 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %467 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %468 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %469 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %470 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %471 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %472 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %473 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %474 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %475 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %476 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %477 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %478 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %479 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %480 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %481 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %482 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %483 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %484 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %485 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %486 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %487 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %488 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %489 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %490 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %491 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %492 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %493 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %494 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %495 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %496 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %497 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %498 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %499 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %500 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %501 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %502 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %503 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %504 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %505 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %506 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %507 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %508 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %509 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ br label %510
+
+510: ; preds = %.lr.ph, %510
+ %511 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %510 ]
+ %512 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %510 ]
+ %513 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %510 ]
+ %514 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %510 ]
+ %515 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %510 ]
+ %516 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %510 ]
+ %517 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %510 ]
+ %518 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %510 ]
+ %519 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %510 ]
+ %520 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %510 ]
+ %521 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %510 ]
+ %522 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %510 ]
+ %523 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %510 ]
+ %524 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %510 ]
+ %525 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %510 ]
+ %526 = phi float [ 0.000000e+00, %.lr.ph ], [ %1845, %510 ]
+ %527 = phi float [ 0.000000e+00, %.lr.ph ], [ %1800, %510 ]
+ %528 = phi float [ 0.000000e+00, %.lr.ph ], [ %1801, %510 ]
+ %529 = phi float [ 0.000000e+00, %.lr.ph ], [ %1802, %510 ]
+ %530 = phi float [ 0.000000e+00, %.lr.ph ], [ %1803, %510 ]
+ %531 = phi float [ 0.000000e+00, %.lr.ph ], [ %1806, %510 ]
+ %532 = phi float [ 0.000000e+00, %.lr.ph ], [ %1807, %510 ]
+ %533 = phi float [ 0.000000e+00, %.lr.ph ], [ %1808, %510 ]
+ %534 = phi float [ 0.000000e+00, %.lr.ph ], [ %1809, %510 ]
+ %535 = phi float [ 0.000000e+00, %.lr.ph ], [ %1812, %510 ]
+ %536 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %510 ]
+ %537 = phi float [ 0.000000e+00, %.lr.ph ], [ %1814, %510 ]
+ %538 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %510 ]
+ %539 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %510 ]
+ %540 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %510 ]
+ %541 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %510 ]
+ %542 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %510 ]
+ %543 = phi float [ 0.000000e+00, %.lr.ph ], [ %1720, %510 ]
+ %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %1721, %510 ]
+ %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %1722, %510 ]
+ %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %1723, %510 ]
+ %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %1726, %510 ]
+ %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %1727, %510 ]
+ %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %1728, %510 ]
+ %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %1729, %510 ]
+ %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %1732, %510 ]
+ %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %1733, %510 ]
+ %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %1734, %510 ]
+ %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %1735, %510 ]
+ %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %1738, %510 ]
+ %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %1739, %510 ]
+ %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %1740, %510 ]
+ %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %1741, %510 ]
+ %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %1696, %510 ]
+ %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %1697, %510 ]
+ %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %1698, %510 ]
+ %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %1699, %510 ]
+ %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %1702, %510 ]
+ %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %1703, %510 ]
+ %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %1704, %510 ]
+ %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %1705, %510 ]
+ %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %1708, %510 ]
+ %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %1709, %510 ]
+ %569 = phi float [ 0.000000e+00, %.lr.ph ], [ %1710, %510 ]
+ %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %1711, %510 ]
+ %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %1714, %510 ]
+ %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %1715, %510 ]
+ %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %1716, %510 ]
+ %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %1717, %510 ]
+ %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %1772, %510 ]
+ %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %1773, %510 ]
+ %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %1774, %510 ]
+ %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %1775, %510 ]
+ %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %1778, %510 ]
+ %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %1779, %510 ]
+ %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %1780, %510 ]
+ %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %1781, %510 ]
+ %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %1784, %510 ]
+ %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %1785, %510 ]
+ %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %1786, %510 ]
+ %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %1787, %510 ]
+ %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %1790, %510 ]
+ %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %1791, %510 ]
+ %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %1792, %510 ]
+ %590 = phi float [ 0.000000e+00, %.lr.ph ], [ %1793, %510 ]
+ %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %1744, %510 ]
+ %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %1745, %510 ]
+ %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %1746, %510 ]
+ %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %1747, %510 ]
+ %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %1750, %510 ]
+ %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %1751, %510 ]
+ %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %1752, %510 ]
+ %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %1753, %510 ]
+ %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %1756, %510 ]
+ %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %1757, %510 ]
+ %601 = phi float [ 0.000000e+00, %.lr.ph ], [ %1758, %510 ]
+ %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %1759, %510 ]
+ %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %1762, %510 ]
+ %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %1763, %510 ]
+ %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %1764, %510 ]
+ %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %1765, %510 ]
+ %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %1668, %510 ]
+ %608 = phi float [ 0.000000e+00, %.lr.ph ], [ %1669, %510 ]
+ %609 = phi float [ 0.000000e+00, %.lr.ph ], [ %1670, %510 ]
+ %610 = phi float [ 0.000000e+00, %.lr.ph ], [ %1671, %510 ]
+ %611 = phi float [ 0.000000e+00, %.lr.ph ], [ %1674, %510 ]
+ %612 = phi float [ 0.000000e+00, %.lr.ph ], [ %1675, %510 ]
+ %613 = phi float [ 0.000000e+00, %.lr.ph ], [ %1676, %510 ]
+ %614 = phi float [ 0.000000e+00, %.lr.ph ], [ %1677, %510 ]
+ %615 = phi float [ 0.000000e+00, %.lr.ph ], [ %1680, %510 ]
+ %616 = phi float [ 0.000000e+00, %.lr.ph ], [ %1681, %510 ]
+ %617 = phi float [ 0.000000e+00, %.lr.ph ], [ %1682, %510 ]
+ %618 = phi float [ 0.000000e+00, %.lr.ph ], [ %1683, %510 ]
+ %619 = phi float [ 0.000000e+00, %.lr.ph ], [ %1686, %510 ]
+ %620 = phi float [ 0.000000e+00, %.lr.ph ], [ %1687, %510 ]
+ %621 = phi float [ 0.000000e+00, %.lr.ph ], [ %1688, %510 ]
+ %622 = phi float [ 0.000000e+00, %.lr.ph ], [ %1689, %510 ]
+ %623 = phi float [ 0.000000e+00, %.lr.ph ], [ %1644, %510 ]
+ %624 = phi float [ 0.000000e+00, %.lr.ph ], [ %1645, %510 ]
+ %625 = phi float [ 0.000000e+00, %.lr.ph ], [ %1646, %510 ]
+ %626 = phi float [ 0.000000e+00, %.lr.ph ], [ %1647, %510 ]
+ %627 = phi float [ 0.000000e+00, %.lr.ph ], [ %1650, %510 ]
+ %628 = phi float [ 0.000000e+00, %.lr.ph ], [ %1651, %510 ]
+ %629 = phi float [ 0.000000e+00, %.lr.ph ], [ %1652, %510 ]
+ %630 = phi float [ 0.000000e+00, %.lr.ph ], [ %1653, %510 ]
+ %631 = phi float [ 0.000000e+00, %.lr.ph ], [ %1656, %510 ]
+ %632 = phi float [ 0.000000e+00, %.lr.ph ], [ %1657, %510 ]
+ %633 = phi float [ 0.000000e+00, %.lr.ph ], [ %1658, %510 ]
+ %634 = phi float [ 0.000000e+00, %.lr.ph ], [ %1659, %510 ]
+ %635 = phi float [ 0.000000e+00, %.lr.ph ], [ %1662, %510 ]
+ %636 = phi float [ 0.000000e+00, %.lr.ph ], [ %1663, %510 ]
+ %637 = phi float [ 0.000000e+00, %.lr.ph ], [ %1664, %510 ]
+ %638 = phi float [ 0.000000e+00, %.lr.ph ], [ %1665, %510 ]
+ %639 = phi float [ 0.000000e+00, %.lr.ph ], [ %1558, %510 ]
+ %640 = phi float [ 0.000000e+00, %.lr.ph ], [ %1559, %510 ]
+ %641 = phi float [ 0.000000e+00, %.lr.ph ], [ %1560, %510 ]
+ %642 = phi float [ 0.000000e+00, %.lr.ph ], [ %1561, %510 ]
+ %643 = phi float [ 0.000000e+00, %.lr.ph ], [ %1564, %510 ]
+ %644 = phi float [ 0.000000e+00, %.lr.ph ], [ %1565, %510 ]
+ %645 = phi float [ 0.000000e+00, %.lr.ph ], [ %1566, %510 ]
+ %646 = phi float [ 0.000000e+00, %.lr.ph ], [ %1567, %510 ]
+ %647 = phi float [ 0.000000e+00, %.lr.ph ], [ %1570, %510 ]
+ %648 = phi float [ 0.000000e+00, %.lr.ph ], [ %1571, %510 ]
+ %649 = phi float [ 0.000000e+00, %.lr.ph ], [ %1572, %510 ]
+ %650 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %510 ]
+ %651 = phi float [ 0.000000e+00, %.lr.ph ], [ %1576, %510 ]
+ %652 = phi float [ 0.000000e+00, %.lr.ph ], [ %1577, %510 ]
+ %653 = phi float [ 0.000000e+00, %.lr.ph ], [ %1578, %510 ]
+ %654 = phi float [ 0.000000e+00, %.lr.ph ], [ %1579, %510 ]
+ %655 = phi float [ 0.000000e+00, %.lr.ph ], [ %1534, %510 ]
+ %656 = phi float [ 0.000000e+00, %.lr.ph ], [ %1535, %510 ]
+ %657 = phi float [ 0.000000e+00, %.lr.ph ], [ %1536, %510 ]
+ %658 = phi float [ 0.000000e+00, %.lr.ph ], [ %1537, %510 ]
+ %659 = phi float [ 0.000000e+00, %.lr.ph ], [ %1540, %510 ]
+ %660 = phi float [ 0.000000e+00, %.lr.ph ], [ %1541, %510 ]
+ %661 = phi float [ 0.000000e+00, %.lr.ph ], [ %1542, %510 ]
+ %662 = phi float [ 0.000000e+00, %.lr.ph ], [ %1543, %510 ]
+ %663 = phi float [ 0.000000e+00, %.lr.ph ], [ %1546, %510 ]
+ %664 = phi float [ 0.000000e+00, %.lr.ph ], [ %1547, %510 ]
+ %665 = phi float [ 0.000000e+00, %.lr.ph ], [ %1548, %510 ]
+ %666 = phi float [ 0.000000e+00, %.lr.ph ], [ %1549, %510 ]
+ %667 = phi float [ 0.000000e+00, %.lr.ph ], [ %1552, %510 ]
+ %668 = phi float [ 0.000000e+00, %.lr.ph ], [ %1553, %510 ]
+ %669 = phi float [ 0.000000e+00, %.lr.ph ], [ %1554, %510 ]
+ %670 = phi float [ 0.000000e+00, %.lr.ph ], [ %1555, %510 ]
+ %671 = phi float [ 0.000000e+00, %.lr.ph ], [ %1396, %510 ]
+ %672 = phi float [ 0.000000e+00, %.lr.ph ], [ %1397, %510 ]
+ %673 = phi float [ 0.000000e+00, %.lr.ph ], [ %1398, %510 ]
+ %674 = phi float [ 0.000000e+00, %.lr.ph ], [ %1399, %510 ]
+ %675 = phi float [ 0.000000e+00, %.lr.ph ], [ %1402, %510 ]
+ %676 = phi float [ 0.000000e+00, %.lr.ph ], [ %1403, %510 ]
+ %677 = phi float [ 0.000000e+00, %.lr.ph ], [ %1404, %510 ]
+ %678 = phi float [ 0.000000e+00, %.lr.ph ], [ %1405, %510 ]
+ %679 = phi float [ 0.000000e+00, %.lr.ph ], [ %1408, %510 ]
+ %680 = phi float [ 0.000000e+00, %.lr.ph ], [ %1409, %510 ]
+ %681 = phi float [ 0.000000e+00, %.lr.ph ], [ %1410, %510 ]
+ %682 = phi float [ 0.000000e+00, %.lr.ph ], [ %1411, %510 ]
+ %683 = phi float [ 0.000000e+00, %.lr.ph ], [ %1414, %510 ]
+ %684 = phi float [ 0.000000e+00, %.lr.ph ], [ %1415, %510 ]
+ %685 = phi float [ 0.000000e+00, %.lr.ph ], [ %1416, %510 ]
+ %686 = phi float [ 0.000000e+00, %.lr.ph ], [ %1417, %510 ]
+ %687 = phi float [ 0.000000e+00, %.lr.ph ], [ %1372, %510 ]
+ %688 = phi float [ 0.000000e+00, %.lr.ph ], [ %1373, %510 ]
+ %689 = phi float [ 0.000000e+00, %.lr.ph ], [ %1374, %510 ]
+ %690 = phi float [ 0.000000e+00, %.lr.ph ], [ %1375, %510 ]
+ %691 = phi float [ 0.000000e+00, %.lr.ph ], [ %1378, %510 ]
+ %692 = phi float [ 0.000000e+00, %.lr.ph ], [ %1379, %510 ]
+ %693 = phi float [ 0.000000e+00, %.lr.ph ], [ %1380, %510 ]
+ %694 = phi float [ 0.000000e+00, %.lr.ph ], [ %1381, %510 ]
+ %695 = phi float [ 0.000000e+00, %.lr.ph ], [ %1384, %510 ]
+ %696 = phi float [ 0.000000e+00, %.lr.ph ], [ %1385, %510 ]
+ %697 = phi float [ 0.000000e+00, %.lr.ph ], [ %1386, %510 ]
+ %698 = phi float [ 0.000000e+00, %.lr.ph ], [ %1387, %510 ]
+ %699 = phi float [ 0.000000e+00, %.lr.ph ], [ %1390, %510 ]
+ %700 = phi float [ 0.000000e+00, %.lr.ph ], [ %1391, %510 ]
+ %701 = phi float [ 0.000000e+00, %.lr.ph ], [ %1392, %510 ]
+ %702 = phi float [ 0.000000e+00, %.lr.ph ], [ %1393, %510 ]
+ %703 = phi float [ 0.000000e+00, %.lr.ph ], [ %1510, %510 ]
+ %704 = phi float [ 0.000000e+00, %.lr.ph ], [ %1511, %510 ]
+ %705 = phi float [ 0.000000e+00, %.lr.ph ], [ %1512, %510 ]
+ %706 = phi float [ 0.000000e+00, %.lr.ph ], [ %1513, %510 ]
+ %707 = phi float [ 0.000000e+00, %.lr.ph ], [ %1516, %510 ]
+ %708 = phi float [ 0.000000e+00, %.lr.ph ], [ %1517, %510 ]
+ %709 = phi float [ 0.000000e+00, %.lr.ph ], [ %1518, %510 ]
+ %710 = phi float [ 0.000000e+00, %.lr.ph ], [ %1519, %510 ]
+ %711 = phi float [ 0.000000e+00, %.lr.ph ], [ %1522, %510 ]
+ %712 = phi float [ 0.000000e+00, %.lr.ph ], [ %1523, %510 ]
+ %713 = phi float [ 0.000000e+00, %.lr.ph ], [ %1524, %510 ]
+ %714 = phi float [ 0.000000e+00, %.lr.ph ], [ %1525, %510 ]
+ %715 = phi float [ 0.000000e+00, %.lr.ph ], [ %1528, %510 ]
+ %716 = phi float [ 0.000000e+00, %.lr.ph ], [ %1529, %510 ]
+ %717 = phi float [ 0.000000e+00, %.lr.ph ], [ %1530, %510 ]
+ %718 = phi float [ 0.000000e+00, %.lr.ph ], [ %1531, %510 ]
+ %719 = phi float [ 0.000000e+00, %.lr.ph ], [ %1482, %510 ]
+ %720 = phi float [ 0.000000e+00, %.lr.ph ], [ %1483, %510 ]
+ %721 = phi float [ 0.000000e+00, %.lr.ph ], [ %1484, %510 ]
+ %722 = phi float [ 0.000000e+00, %.lr.ph ], [ %1485, %510 ]
+ %723 = phi float [ 0.000000e+00, %.lr.ph ], [ %1488, %510 ]
+ %724 = phi float [ 0.000000e+00, %.lr.ph ], [ %1489, %510 ]
+ %725 = phi float [ 0.000000e+00, %.lr.ph ], [ %1490, %510 ]
+ %726 = phi float [ 0.000000e+00, %.lr.ph ], [ %1491, %510 ]
+ %727 = phi float [ 0.000000e+00, %.lr.ph ], [ %1494, %510 ]
+ %728 = phi float [ 0.000000e+00, %.lr.ph ], [ %1495, %510 ]
+ %729 = phi float [ 0.000000e+00, %.lr.ph ], [ %1496, %510 ]
+ %730 = phi float [ 0.000000e+00, %.lr.ph ], [ %1497, %510 ]
+ %731 = phi float [ 0.000000e+00, %.lr.ph ], [ %1500, %510 ]
+ %732 = phi float [ 0.000000e+00, %.lr.ph ], [ %1501, %510 ]
+ %733 = phi float [ 0.000000e+00, %.lr.ph ], [ %1502, %510 ]
+ %734 = phi float [ 0.000000e+00, %.lr.ph ], [ %1503, %510 ]
+ %735 = phi float [ 0.000000e+00, %.lr.ph ], [ %1340, %510 ]
+ %736 = phi float [ 0.000000e+00, %.lr.ph ], [ %1341, %510 ]
+ %737 = phi float [ 0.000000e+00, %.lr.ph ], [ %1342, %510 ]
+ %738 = phi float [ 0.000000e+00, %.lr.ph ], [ %1343, %510 ]
+ %739 = phi float [ 0.000000e+00, %.lr.ph ], [ %1346, %510 ]
+ %740 = phi float [ 0.000000e+00, %.lr.ph ], [ %1347, %510 ]
+ %741 = phi float [ 0.000000e+00, %.lr.ph ], [ %1348, %510 ]
+ %742 = phi float [ 0.000000e+00, %.lr.ph ], [ %1349, %510 ]
+ %743 = phi float [ 0.000000e+00, %.lr.ph ], [ %1352, %510 ]
+ %744 = phi float [ 0.000000e+00, %.lr.ph ], [ %1353, %510 ]
+ %745 = phi float [ 0.000000e+00, %.lr.ph ], [ %1354, %510 ]
+ %746 = phi float [ 0.000000e+00, %.lr.ph ], [ %1355, %510 ]
+ %747 = phi float [ 0.000000e+00, %.lr.ph ], [ %1358, %510 ]
+ %748 = phi float [ 0.000000e+00, %.lr.ph ], [ %1359, %510 ]
+ %749 = phi float [ 0.000000e+00, %.lr.ph ], [ %1360, %510 ]
+ %750 = phi float [ 0.000000e+00, %.lr.ph ], [ %1361, %510 ]
+ %751 = phi ptr addrspace(1) [ %138, %.lr.ph ], [ %1620, %510 ]
+ %752 = phi ptr addrspace(1) [ %64, %.lr.ph ], [ %1458, %510 ]
+ %753 = phi float [ 0.000000e+00, %.lr.ph ], [ %1308, %510 ]
+ %754 = phi float [ 0.000000e+00, %.lr.ph ], [ %1309, %510 ]
+ %755 = phi float [ 0.000000e+00, %.lr.ph ], [ %1310, %510 ]
+ %756 = phi float [ 0.000000e+00, %.lr.ph ], [ %1311, %510 ]
+ %757 = phi float [ 0.000000e+00, %.lr.ph ], [ %1314, %510 ]
+ %758 = phi float [ 0.000000e+00, %.lr.ph ], [ %1315, %510 ]
+ %759 = phi float [ 0.000000e+00, %.lr.ph ], [ %1316, %510 ]
+ %760 = phi float [ 0.000000e+00, %.lr.ph ], [ %1317, %510 ]
+ %761 = phi float [ 0.000000e+00, %.lr.ph ], [ %1320, %510 ]
+ %762 = phi float [ 0.000000e+00, %.lr.ph ], [ %1321, %510 ]
+ %763 = phi float [ 0.000000e+00, %.lr.ph ], [ %1322, %510 ]
+ %764 = phi float [ 0.000000e+00, %.lr.ph ], [ %1323, %510 ]
+ %765 = phi float [ 0.000000e+00, %.lr.ph ], [ %1326, %510 ]
+ %766 = phi float [ 0.000000e+00, %.lr.ph ], [ %1327, %510 ]
+ %767 = phi float [ 0.000000e+00, %.lr.ph ], [ %1328, %510 ]
+ %768 = phi float [ 0.000000e+00, %.lr.ph ], [ %1329, %510 ]
+ %769 = phi i32 [ 0, %.lr.ph ], [ %1846, %510 ]
+ %770 = phi <2 x half> [ %414, %.lr.ph ], [ %1910, %510 ]
+ %771 = phi <2 x half> [ %415, %.lr.ph ], [ %1909, %510 ]
+ %772 = phi <2 x half> [ %416, %.lr.ph ], [ %1908, %510 ]
+ %773 = phi <2 x half> [ %417, %.lr.ph ], [ %1907, %510 ]
+ %774 = phi <2 x half> [ %418, %.lr.ph ], [ %1906, %510 ]
+ %775 = phi <2 x half> [ %419, %.lr.ph ], [ %1905, %510 ]
+ %776 = phi <2 x half> [ %420, %.lr.ph ], [ %1904, %510 ]
+ %777 = phi <2 x half> [ %421, %.lr.ph ], [ %1903, %510 ]
+ %778 = phi <2 x half> [ %422, %.lr.ph ], [ %1902, %510 ]
+ %779 = phi <2 x half> [ %423, %.lr.ph ], [ %1901, %510 ]
+ %780 = phi <2 x half> [ %424, %.lr.ph ], [ %1900, %510 ]
+ %781 = phi <2 x half> [ %425, %.lr.ph ], [ %1899, %510 ]
+ %782 = phi <2 x half> [ %426, %.lr.ph ], [ %1898, %510 ]
+ %783 = phi <2 x half> [ %427, %.lr.ph ], [ %1897, %510 ]
+ %784 = phi <2 x half> [ %428, %.lr.ph ], [ %1896, %510 ]
+ %785 = phi <2 x half> [ %429, %.lr.ph ], [ %1895, %510 ]
+ %786 = phi <2 x half> [ %430, %.lr.ph ], [ %1894, %510 ]
+ %787 = phi <2 x half> [ %431, %.lr.ph ], [ %1893, %510 ]
+ %788 = phi <2 x half> [ %432, %.lr.ph ], [ %1892, %510 ]
+ %789 = phi <2 x half> [ %433, %.lr.ph ], [ %1891, %510 ]
+ %790 = phi <2 x half> [ %434, %.lr.ph ], [ %1890, %510 ]
+ %791 = phi <2 x half> [ %435, %.lr.ph ], [ %1889, %510 ]
+ %792 = phi <2 x half> [ %436, %.lr.ph ], [ %1888, %510 ]
+ %793 = phi <2 x half> [ %437, %.lr.ph ], [ %1887, %510 ]
+ %794 = phi <2 x half> [ %438, %.lr.ph ], [ %1886, %510 ]
+ %795 = phi <2 x half> [ %439, %.lr.ph ], [ %1885, %510 ]
+ %796 = phi <2 x half> [ %440, %.lr.ph ], [ %1884, %510 ]
+ %797 = phi <2 x half> [ %441, %.lr.ph ], [ %1883, %510 ]
+ %798 = phi <2 x half> [ %442, %.lr.ph ], [ %1882, %510 ]
+ %799 = phi <2 x half> [ %443, %.lr.ph ], [ %1881, %510 ]
+ %800 = phi <2 x half> [ %444, %.lr.ph ], [ %1880, %510 ]
+ %801 = phi <2 x half> [ %445, %.lr.ph ], [ %1879, %510 ]
+ %802 = phi <2 x half> [ %446, %.lr.ph ], [ %1878, %510 ]
+ %803 = phi <2 x half> [ %447, %.lr.ph ], [ %1942, %510 ]
+ %804 = phi <2 x half> [ %448, %.lr.ph ], [ %1941, %510 ]
+ %805 = phi <2 x half> [ %449, %.lr.ph ], [ %1877, %510 ]
+ %806 = phi <2 x half> [ %450, %.lr.ph ], [ %1876, %510 ]
+ %807 = phi <2 x half> [ %451, %.lr.ph ], [ %1940, %510 ]
+ %808 = phi <2 x half> [ %452, %.lr.ph ], [ %1939, %510 ]
+ %809 = phi <2 x half> [ %453, %.lr.ph ], [ %1875, %510 ]
+ %810 = phi <2 x half> [ %454, %.lr.ph ], [ %1874, %510 ]
+ %811 = phi <2 x half> [ %455, %.lr.ph ], [ %1938, %510 ]
+ %812 = phi <2 x half> [ %456, %.lr.ph ], [ %1937, %510 ]
+ %813 = phi <2 x half> [ %457, %.lr.ph ], [ %1873, %510 ]
+ %814 = phi <2 x half> [ %458, %.lr.ph ], [ %1872, %510 ]
+ %815 = phi <2 x half> [ %459, %.lr.ph ], [ %1936, %510 ]
+ %816 = phi <2 x half> [ %460, %.lr.ph ], [ %1935, %510 ]
+ %817 = phi <2 x half> [ %461, %.lr.ph ], [ %1871, %510 ]
+ %818 = phi <2 x half> [ %462, %.lr.ph ], [ %1870, %510 ]
+ %819 = phi <2 x half> [ %463, %.lr.ph ], [ %1934, %510 ]
+ %820 = phi <2 x half> [ %464, %.lr.ph ], [ %1933, %510 ]
+ %821 = phi <2 x half> [ %465, %.lr.ph ], [ %1869, %510 ]
+ %822 = phi <2 x half> [ %466, %.lr.ph ], [ %1868, %510 ]
+ %823 = phi <2 x half> [ %467, %.lr.ph ], [ %1932, %510 ]
+ %824 = phi <2 x half> [ %468, %.lr.ph ], [ %1931, %510 ]
+ %825 = phi <2 x half> [ %469, %.lr.ph ], [ %1867, %510 ]
+ %826 = phi <2 x half> [ %470, %.lr.ph ], [ %1866, %510 ]
+ %827 = phi <2 x half> [ %471, %.lr.ph ], [ %1930, %510 ]
+ %828 = phi <2 x half> [ %472, %.lr.ph ], [ %1929, %510 ]
+ %829 = phi <2 x half> [ %473, %.lr.ph ], [ %1865, %510 ]
+ %830 = phi <2 x half> [ %474, %.lr.ph ], [ %1864, %510 ]
+ %831 = phi <2 x half> [ %475, %.lr.ph ], [ %1928, %510 ]
+ %832 = phi <2 x half> [ %476, %.lr.ph ], [ %1927, %510 ]
+ %833 = phi <2 x half> [ %477, %.lr.ph ], [ %1863, %510 ]
+ %834 = phi <2 x half> [ %478, %.lr.ph ], [ %1862, %510 ]
+ %835 = phi <2 x half> [ %479, %.lr.ph ], [ %1926, %510 ]
+ %836 = phi <2 x half> [ %480, %.lr.ph ], [ %1925, %510 ]
+ %837 = phi <2 x half> [ %481, %.lr.ph ], [ %1861, %510 ]
+ %838 = phi <2 x half> [ %482, %.lr.ph ], [ %1860, %510 ]
+ %839 = phi <2 x half> [ %483, %.lr.ph ], [ %1924, %510 ]
+ %840 = phi <2 x half> [ %484, %.lr.ph ], [ %1923, %510 ]
+ %841 = phi <2 x half> [ %485, %.lr.ph ], [ %1859, %510 ]
+ %842 = phi <2 x half> [ %486, %.lr.ph ], [ %1858, %510 ]
+ %843 = phi <2 x half> [ %487, %.lr.ph ], [ %1922, %510 ]
+ %844 = phi <2 x half> [ %488, %.lr.ph ], [ %1921, %510 ]
+ %845 = phi <2 x half> [ %489, %.lr.ph ], [ %1857, %510 ]
+ %846 = phi <2 x half> [ %490, %.lr.ph ], [ %1856, %510 ]
+ %847 = phi <2 x half> [ %491, %.lr.ph ], [ %1920, %510 ]
+ %848 = phi <2 x half> [ %492, %.lr.ph ], [ %1919, %510 ]
+ %849 = phi <2 x half> [ %493, %.lr.ph ], [ %1855, %510 ]
+ %850 = phi <2 x half> [ %494, %.lr.ph ], [ %1854, %510 ]
+ %851 = phi <2 x half> [ %495, %.lr.ph ], [ %1918, %510 ]
+ %852 = phi <2 x half> [ %496, %.lr.ph ], [ %1917, %510 ]
+ %853 = phi <2 x half> [ %497, %.lr.ph ], [ %1853, %510 ]
+ %854 = phi <2 x half> [ %498, %.lr.ph ], [ %1852, %510 ]
+ %855 = phi <2 x half> [ %499, %.lr.ph ], [ %1916, %510 ]
+ %856 = phi <2 x half> [ %500, %.lr.ph ], [ %1915, %510 ]
+ %857 = phi <2 x half> [ %501, %.lr.ph ], [ %1851, %510 ]
+ %858 = phi <2 x half> [ %502, %.lr.ph ], [ %1850, %510 ]
+ %859 = phi <2 x half> [ %503, %.lr.ph ], [ %1914, %510 ]
+ %860 = phi <2 x half> [ %504, %.lr.ph ], [ %1913, %510 ]
+ %861 = phi <2 x half> [ %505, %.lr.ph ], [ %1849, %510 ]
+ %862 = phi <2 x half> [ %506, %.lr.ph ], [ %1848, %510 ]
+ %863 = phi <2 x half> [ %507, %.lr.ph ], [ %1912, %510 ]
+ %864 = phi <2 x half> [ %508, %.lr.ph ], [ %1911, %510 ]
+ %865 = phi <2 x half> [ %509, %.lr.ph ], [ %1847, %510 ]
+ %866 = shufflevector <2 x half> %801, <2 x half> %800, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %867 = shufflevector <2 x half> %799, <2 x half> %798, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %868 = shufflevector <2 x half> %797, <2 x half> %796, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %869 = shufflevector <2 x half> %795, <2 x half> %794, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %870 = shufflevector <2 x half> %793, <2 x half> %792, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %871 = shufflevector <2 x half> %791, <2 x half> %790, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %872 = shufflevector <2 x half> %789, <2 x half> %788, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %873 = shufflevector <2 x half> %787, <2 x half> %786, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %874 = insertelement <4 x float> poison, float %753, i64 0
+ %875 = insertelement <4 x float> %874, float %754, i64 1
+ %876 = insertelement <4 x float> %875, float %755, i64 2
+ %877 = insertelement <4 x float> %876, float %756, i64 3
+ %878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %866, <4 x float> %877, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %867, <4 x float> %878, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %880 = insertelement <4 x float> poison, float %757, i64 0
+ %881 = insertelement <4 x float> %880, float %758, i64 1
+ %882 = insertelement <4 x float> %881, float %759, i64 2
+ %883 = insertelement <4 x float> %882, float %760, i64 3
+ %884 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %866, <4 x float> %883, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %885 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %867, <4 x float> %884, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %886 = insertelement <4 x float> poison, float %761, i64 0
+ %887 = insertelement <4 x float> %886, float %762, i64 1
+ %888 = insertelement <4 x float> %887, float %763, i64 2
+ %889 = insertelement <4 x float> %888, float %764, i64 3
+ %890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %868, <4 x float> %889, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %869, <4 x float> %890, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %892 = insertelement <4 x float> poison, float %765, i64 0
+ %893 = insertelement <4 x float> %892, float %766, i64 1
+ %894 = insertelement <4 x float> %893, float %767, i64 2
+ %895 = insertelement <4 x float> %894, float %768, i64 3
+ %896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %868, <4 x float> %895, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %869, <4 x float> %896, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !7
+ %898 = shufflevector <2 x half> %777, <2 x half> %776, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %899 = shufflevector <2 x half> %775, <2 x half> %774, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %900 = shufflevector <2 x half> %773, <2 x half> %772, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %901 = shufflevector <2 x half> %771, <2 x half> %770, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %902 = insertelement <4 x float> poison, float %735, i64 0
+ %903 = insertelement <4 x float> %902, float %736, i64 1
+ %904 = insertelement <4 x float> %903, float %737, i64 2
+ %905 = insertelement <4 x float> %904, float %738, i64 3
+ %906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %866, <4 x float> %905, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %867, <4 x float> %906, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %908 = insertelement <4 x float> poison, float %739, i64 0
+ %909 = insertelement <4 x float> %908, float %740, i64 1
+ %910 = insertelement <4 x float> %909, float %741, i64 2
+ %911 = insertelement <4 x float> %910, float %742, i64 3
+ %912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %866, <4 x float> %911, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %867, <4 x float> %912, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %914 = insertelement <4 x float> poison, float %743, i64 0
+ %915 = insertelement <4 x float> %914, float %744, i64 1
+ %916 = insertelement <4 x float> %915, float %745, i64 2
+ %917 = insertelement <4 x float> %916, float %746, i64 3
+ %918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %868, <4 x float> %917, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %869, <4 x float> %918, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %920 = insertelement <4 x float> poison, float %747, i64 0
+ %921 = insertelement <4 x float> %920, float %748, i64 1
+ %922 = insertelement <4 x float> %921, float %749, i64 2
+ %923 = insertelement <4 x float> %922, float %750, i64 3
+ %924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %868, <4 x float> %923, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %869, <4 x float> %924, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !8
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !9
+ %926 = load <8 x half>, ptr addrspace(3) %362, align 16
+ %927 = load <8 x half>, ptr addrspace(3) %363, align 16
+ %928 = load <8 x half>, ptr addrspace(3) %366, align 16
+ %929 = load <8 x half>, ptr addrspace(3) %367, align 16
+ %930 = shufflevector <2 x half> %785, <2 x half> %784, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %931 = shufflevector <2 x half> %783, <2 x half> %782, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %932 = shufflevector <2 x half> %781, <2 x half> %780, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %933 = shufflevector <2 x half> %779, <2 x half> %778, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %934 = insertelement <4 x float> poison, float %687, i64 0
+ %935 = insertelement <4 x float> %934, float %688, i64 1
+ %936 = insertelement <4 x float> %935, float %689, i64 2
+ %937 = insertelement <4 x float> %936, float %690, i64 3
+ %938 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %930, <4 x float> %937, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %939 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %931, <4 x float> %938, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %940 = insertelement <4 x float> poison, float %691, i64 0
+ %941 = insertelement <4 x float> %940, float %692, i64 1
+ %942 = insertelement <4 x float> %941, float %693, i64 2
+ %943 = insertelement <4 x float> %942, float %694, i64 3
+ %944 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %930, <4 x float> %943, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %945 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %931, <4 x float> %944, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %946 = insertelement <4 x float> poison, float %695, i64 0
+ %947 = insertelement <4 x float> %946, float %696, i64 1
+ %948 = insertelement <4 x float> %947, float %697, i64 2
+ %949 = insertelement <4 x float> %948, float %698, i64 3
+ %950 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %932, <4 x float> %949, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %951 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %933, <4 x float> %950, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %952 = insertelement <4 x float> poison, float %699, i64 0
+ %953 = insertelement <4 x float> %952, float %700, i64 1
+ %954 = insertelement <4 x float> %953, float %701, i64 2
+ %955 = insertelement <4 x float> %954, float %702, i64 3
+ %956 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %932, <4 x float> %955, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %957 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %933, <4 x float> %956, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !10
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !11
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !12
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !13
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !14
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !15
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !16
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !17
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !18
+ %958 = insertelement <4 x float> poison, float %671, i64 0
+ %959 = insertelement <4 x float> %958, float %672, i64 1
+ %960 = insertelement <4 x float> %959, float %673, i64 2
+ %961 = insertelement <4 x float> %960, float %674, i64 3
+ %962 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %930, <4 x float> %961, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %963 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %931, <4 x float> %962, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %964 = insertelement <4 x float> poison, float %675, i64 0
+ %965 = insertelement <4 x float> %964, float %676, i64 1
+ %966 = insertelement <4 x float> %965, float %677, i64 2
+ %967 = insertelement <4 x float> %966, float %678, i64 3
+ %968 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %930, <4 x float> %967, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %969 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %931, <4 x float> %968, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %970 = insertelement <4 x float> poison, float %679, i64 0
+ %971 = insertelement <4 x float> %970, float %680, i64 1
+ %972 = insertelement <4 x float> %971, float %681, i64 2
+ %973 = insertelement <4 x float> %972, float %682, i64 3
+ %974 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %932, <4 x float> %973, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %975 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %933, <4 x float> %974, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %976 = insertelement <4 x float> poison, float %683, i64 0
+ %977 = insertelement <4 x float> %976, float %684, i64 1
+ %978 = insertelement <4 x float> %977, float %685, i64 2
+ %979 = insertelement <4 x float> %978, float %686, i64 3
+ %980 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %932, <4 x float> %979, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %981 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %933, <4 x float> %980, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !19
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !20
+ %982 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %983 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %984 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %985 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %986 = insertelement <4 x float> poison, float %719, i64 0
+ %987 = insertelement <4 x float> %986, float %720, i64 1
+ %988 = insertelement <4 x float> %987, float %721, i64 2
+ %989 = insertelement <4 x float> %988, float %722, i64 3
+ %990 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %866, <4 x float> %989, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %991 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %867, <4 x float> %990, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %992 = insertelement <4 x float> poison, float %723, i64 0
+ %993 = insertelement <4 x float> %992, float %724, i64 1
+ %994 = insertelement <4 x float> %993, float %725, i64 2
+ %995 = insertelement <4 x float> %994, float %726, i64 3
+ %996 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %866, <4 x float> %995, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %997 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %867, <4 x float> %996, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %998 = insertelement <4 x float> poison, float %727, i64 0
+ %999 = insertelement <4 x float> %998, float %728, i64 1
+ %1000 = insertelement <4 x float> %999, float %729, i64 2
+ %1001 = insertelement <4 x float> %1000, float %730, i64 3
+ %1002 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %868, <4 x float> %1001, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1003 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %869, <4 x float> %1002, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1004 = insertelement <4 x float> poison, float %731, i64 0
+ %1005 = insertelement <4 x float> %1004, float %732, i64 1
+ %1006 = insertelement <4 x float> %1005, float %733, i64 2
+ %1007 = insertelement <4 x float> %1006, float %734, i64 3
+ %1008 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %868, <4 x float> %1007, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1009 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %869, <4 x float> %1008, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !21
+ %1010 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1011 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1012 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1013 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1014 = insertelement <4 x float> poison, float %703, i64 0
+ %1015 = insertelement <4 x float> %1014, float %704, i64 1
+ %1016 = insertelement <4 x float> %1015, float %705, i64 2
+ %1017 = insertelement <4 x float> %1016, float %706, i64 3
+ %1018 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %866, <4 x float> %1017, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1019 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %867, <4 x float> %1018, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1020 = insertelement <4 x float> poison, float %707, i64 0
+ %1021 = insertelement <4 x float> %1020, float %708, i64 1
+ %1022 = insertelement <4 x float> %1021, float %709, i64 2
+ %1023 = insertelement <4 x float> %1022, float %710, i64 3
+ %1024 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %866, <4 x float> %1023, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1025 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %867, <4 x float> %1024, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1026 = insertelement <4 x float> poison, float %711, i64 0
+ %1027 = insertelement <4 x float> %1026, float %712, i64 1
+ %1028 = insertelement <4 x float> %1027, float %713, i64 2
+ %1029 = insertelement <4 x float> %1028, float %714, i64 3
+ %1030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %868, <4 x float> %1029, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %869, <4 x float> %1030, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1032 = insertelement <4 x float> poison, float %715, i64 0
+ %1033 = insertelement <4 x float> %1032, float %716, i64 1
+ %1034 = insertelement <4 x float> %1033, float %717, i64 2
+ %1035 = insertelement <4 x float> %1034, float %718, i64 3
+ %1036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %868, <4 x float> %1035, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %869, <4 x float> %1036, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !22
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !23
+ %1038 = load <8 x half>, ptr addrspace(3) %372, align 16
+ %1039 = load <8 x half>, ptr addrspace(3) %373, align 16
+ %1040 = load <8 x half>, ptr addrspace(3) %378, align 16
+ %1041 = load <8 x half>, ptr addrspace(3) %379, align 16
+ %1042 = insertelement <4 x float> poison, float %655, i64 0
+ %1043 = insertelement <4 x float> %1042, float %656, i64 1
+ %1044 = insertelement <4 x float> %1043, float %657, i64 2
+ %1045 = insertelement <4 x float> %1044, float %658, i64 3
+ %1046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %930, <4 x float> %1045, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %931, <4 x float> %1046, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1048 = insertelement <4 x float> poison, float %659, i64 0
+ %1049 = insertelement <4 x float> %1048, float %660, i64 1
+ %1050 = insertelement <4 x float> %1049, float %661, i64 2
+ %1051 = insertelement <4 x float> %1050, float %662, i64 3
+ %1052 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %930, <4 x float> %1051, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1053 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %931, <4 x float> %1052, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1054 = insertelement <4 x float> poison, float %663, i64 0
+ %1055 = insertelement <4 x float> %1054, float %664, i64 1
+ %1056 = insertelement <4 x float> %1055, float %665, i64 2
+ %1057 = insertelement <4 x float> %1056, float %666, i64 3
+ %1058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %932, <4 x float> %1057, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %933, <4 x float> %1058, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1060 = insertelement <4 x float> poison, float %667, i64 0
+ %1061 = insertelement <4 x float> %1060, float %668, i64 1
+ %1062 = insertelement <4 x float> %1061, float %669, i64 2
+ %1063 = insertelement <4 x float> %1062, float %670, i64 3
+ %1064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %932, <4 x float> %1063, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %933, <4 x float> %1064, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !24
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !25
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !26
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !27
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !28
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !29
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !30
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !31
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !32
+ %1066 = insertelement <4 x float> poison, float %639, i64 0
+ %1067 = insertelement <4 x float> %1066, float %640, i64 1
+ %1068 = insertelement <4 x float> %1067, float %641, i64 2
+ %1069 = insertelement <4 x float> %1068, float %642, i64 3
+ %1070 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %930, <4 x float> %1069, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1071 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %931, <4 x float> %1070, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1072 = insertelement <4 x float> poison, float %643, i64 0
+ %1073 = insertelement <4 x float> %1072, float %644, i64 1
+ %1074 = insertelement <4 x float> %1073, float %645, i64 2
+ %1075 = insertelement <4 x float> %1074, float %646, i64 3
+ %1076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %930, <4 x float> %1075, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %931, <4 x float> %1076, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1078 = insertelement <4 x float> poison, float %647, i64 0
+ %1079 = insertelement <4 x float> %1078, float %648, i64 1
+ %1080 = insertelement <4 x float> %1079, float %649, i64 2
+ %1081 = insertelement <4 x float> %1080, float %650, i64 3
+ %1082 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %932, <4 x float> %1081, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1083 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %933, <4 x float> %1082, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1084 = insertelement <4 x float> poison, float %651, i64 0
+ %1085 = insertelement <4 x float> %1084, float %652, i64 1
+ %1086 = insertelement <4 x float> %1085, float %653, i64 2
+ %1087 = insertelement <4 x float> %1086, float %654, i64 3
+ %1088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %932, <4 x float> %1087, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %933, <4 x float> %1088, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !33
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !34
+ %1090 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1091 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1092 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1093 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1094 = insertelement <4 x float> poison, float %623, i64 0
+ %1095 = insertelement <4 x float> %1094, float %624, i64 1
+ %1096 = insertelement <4 x float> %1095, float %625, i64 2
+ %1097 = insertelement <4 x float> %1096, float %626, i64 3
+ %1098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1090, <4 x float> %1097, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1091, <4 x float> %1098, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1100 = insertelement <4 x float> poison, float %627, i64 0
+ %1101 = insertelement <4 x float> %1100, float %628, i64 1
+ %1102 = insertelement <4 x float> %1101, float %629, i64 2
+ %1103 = insertelement <4 x float> %1102, float %630, i64 3
+ %1104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1090, <4 x float> %1103, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1091, <4 x float> %1104, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1106 = insertelement <4 x float> poison, float %631, i64 0
+ %1107 = insertelement <4 x float> %1106, float %632, i64 1
+ %1108 = insertelement <4 x float> %1107, float %633, i64 2
+ %1109 = insertelement <4 x float> %1108, float %634, i64 3
+ %1110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1092, <4 x float> %1109, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1093, <4 x float> %1110, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1112 = insertelement <4 x float> poison, float %635, i64 0
+ %1113 = insertelement <4 x float> %1112, float %636, i64 1
+ %1114 = insertelement <4 x float> %1113, float %637, i64 2
+ %1115 = insertelement <4 x float> %1114, float %638, i64 3
+ %1116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1092, <4 x float> %1115, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1093, <4 x float> %1116, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1118 = insertelement <4 x float> poison, float %607, i64 0
+ %1119 = insertelement <4 x float> %1118, float %608, i64 1
+ %1120 = insertelement <4 x float> %1119, float %609, i64 2
+ %1121 = insertelement <4 x float> %1120, float %610, i64 3
+ %1122 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1090, <4 x float> %1121, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1123 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1091, <4 x float> %1122, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1124 = insertelement <4 x float> poison, float %611, i64 0
+ %1125 = insertelement <4 x float> %1124, float %612, i64 1
+ %1126 = insertelement <4 x float> %1125, float %613, i64 2
+ %1127 = insertelement <4 x float> %1126, float %614, i64 3
+ %1128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1090, <4 x float> %1127, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1091, <4 x float> %1128, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1130 = insertelement <4 x float> poison, float %615, i64 0
+ %1131 = insertelement <4 x float> %1130, float %616, i64 1
+ %1132 = insertelement <4 x float> %1131, float %617, i64 2
+ %1133 = insertelement <4 x float> %1132, float %618, i64 3
+ %1134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1092, <4 x float> %1133, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1093, <4 x float> %1134, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1136 = insertelement <4 x float> poison, float %619, i64 0
+ %1137 = insertelement <4 x float> %1136, float %620, i64 1
+ %1138 = insertelement <4 x float> %1137, float %621, i64 2
+ %1139 = insertelement <4 x float> %1138, float %622, i64 3
+ %1140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1092, <4 x float> %1139, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1093, <4 x float> %1140, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1142 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1143 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1144 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1145 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1146 = insertelement <4 x float> poison, float %559, i64 0
+ %1147 = insertelement <4 x float> %1146, float %560, i64 1
+ %1148 = insertelement <4 x float> %1147, float %561, i64 2
+ %1149 = insertelement <4 x float> %1148, float %562, i64 3
+ %1150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1142, <4 x float> %1149, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1143, <4 x float> %1150, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1152 = insertelement <4 x float> poison, float %563, i64 0
+ %1153 = insertelement <4 x float> %1152, float %564, i64 1
+ %1154 = insertelement <4 x float> %1153, float %565, i64 2
+ %1155 = insertelement <4 x float> %1154, float %566, i64 3
+ %1156 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1142, <4 x float> %1155, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1157 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1143, <4 x float> %1156, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1158 = insertelement <4 x float> poison, float %567, i64 0
+ %1159 = insertelement <4 x float> %1158, float %568, i64 1
+ %1160 = insertelement <4 x float> %1159, float %569, i64 2
+ %1161 = insertelement <4 x float> %1160, float %570, i64 3
+ %1162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1144, <4 x float> %1161, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1145, <4 x float> %1162, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1164 = insertelement <4 x float> poison, float %571, i64 0
+ %1165 = insertelement <4 x float> %1164, float %572, i64 1
+ %1166 = insertelement <4 x float> %1165, float %573, i64 2
+ %1167 = insertelement <4 x float> %1166, float %574, i64 3
+ %1168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1144, <4 x float> %1167, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1145, <4 x float> %1168, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1170 = insertelement <4 x float> poison, float %543, i64 0
+ %1171 = insertelement <4 x float> %1170, float %544, i64 1
+ %1172 = insertelement <4 x float> %1171, float %545, i64 2
+ %1173 = insertelement <4 x float> %1172, float %546, i64 3
+ %1174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1142, <4 x float> %1173, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1143, <4 x float> %1174, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1176 = insertelement <4 x float> poison, float %547, i64 0
+ %1177 = insertelement <4 x float> %1176, float %548, i64 1
+ %1178 = insertelement <4 x float> %1177, float %549, i64 2
+ %1179 = insertelement <4 x float> %1178, float %550, i64 3
+ %1180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1142, <4 x float> %1179, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1143, <4 x float> %1180, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1182 = insertelement <4 x float> poison, float %551, i64 0
+ %1183 = insertelement <4 x float> %1182, float %552, i64 1
+ %1184 = insertelement <4 x float> %1183, float %553, i64 2
+ %1185 = insertelement <4 x float> %1184, float %554, i64 3
+ %1186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1144, <4 x float> %1185, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1145, <4 x float> %1186, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1188 = insertelement <4 x float> poison, float %555, i64 0
+ %1189 = insertelement <4 x float> %1188, float %556, i64 1
+ %1190 = insertelement <4 x float> %1189, float %557, i64 2
+ %1191 = insertelement <4 x float> %1190, float %558, i64 3
+ %1192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1144, <4 x float> %1191, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1145, <4 x float> %1192, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !35
+ %1194 = insertelement <4 x float> poison, float %591, i64 0
+ %1195 = insertelement <4 x float> %1194, float %592, i64 1
+ %1196 = insertelement <4 x float> %1195, float %593, i64 2
+ %1197 = insertelement <4 x float> %1196, float %594, i64 3
+ %1198 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1090, <4 x float> %1197, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1199 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1091, <4 x float> %1198, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1200 = insertelement <4 x float> poison, float %595, i64 0
+ %1201 = insertelement <4 x float> %1200, float %596, i64 1
+ %1202 = insertelement <4 x float> %1201, float %597, i64 2
+ %1203 = insertelement <4 x float> %1202, float %598, i64 3
+ %1204 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1090, <4 x float> %1203, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1205 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1091, <4 x float> %1204, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1206 = insertelement <4 x float> poison, float %599, i64 0
+ %1207 = insertelement <4 x float> %1206, float %600, i64 1
+ %1208 = insertelement <4 x float> %1207, float %601, i64 2
+ %1209 = insertelement <4 x float> %1208, float %602, i64 3
+ %1210 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1092, <4 x float> %1209, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1211 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1093, <4 x float> %1210, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1212 = insertelement <4 x float> poison, float %603, i64 0
+ %1213 = insertelement <4 x float> %1212, float %604, i64 1
+ %1214 = insertelement <4 x float> %1213, float %605, i64 2
+ %1215 = insertelement <4 x float> %1214, float %606, i64 3
+ %1216 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1092, <4 x float> %1215, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1217 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1093, <4 x float> %1216, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !36
+ %1218 = load <8 x half>, ptr addrspace(3) %385, align 16
+ %1219 = load <8 x half>, ptr addrspace(3) %386, align 16
+ %1220 = load <8 x half>, ptr addrspace(3) %388, align 16
+ %1221 = load <8 x half>, ptr addrspace(3) %389, align 16
+ %1222 = insertelement <4 x float> poison, float %575, i64 0
+ %1223 = insertelement <4 x float> %1222, float %576, i64 1
+ %1224 = insertelement <4 x float> %1223, float %577, i64 2
+ %1225 = insertelement <4 x float> %1224, float %578, i64 3
+ %1226 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1090, <4 x float> %1225, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1227 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1091, <4 x float> %1226, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1228 = insertelement <4 x float> poison, float %579, i64 0
+ %1229 = insertelement <4 x float> %1228, float %580, i64 1
+ %1230 = insertelement <4 x float> %1229, float %581, i64 2
+ %1231 = insertelement <4 x float> %1230, float %582, i64 3
+ %1232 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1090, <4 x float> %1231, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1233 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1091, <4 x float> %1232, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1234 = insertelement <4 x float> poison, float %583, i64 0
+ %1235 = insertelement <4 x float> %1234, float %584, i64 1
+ %1236 = insertelement <4 x float> %1235, float %585, i64 2
+ %1237 = insertelement <4 x float> %1236, float %586, i64 3
+ %1238 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1092, <4 x float> %1237, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1239 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1093, <4 x float> %1238, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1240 = insertelement <4 x float> poison, float %587, i64 0
+ %1241 = insertelement <4 x float> %1240, float %588, i64 1
+ %1242 = insertelement <4 x float> %1241, float %589, i64 2
+ %1243 = insertelement <4 x float> %1242, float %590, i64 3
+ %1244 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1092, <4 x float> %1243, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1245 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1093, <4 x float> %1244, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !37
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !38
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !39
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !40
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !41
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !42
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !43
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !44
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !45
+ %1246 = load <8 x half>, ptr addrspace(3) %391, align 16
+ %1247 = load <8 x half>, ptr addrspace(3) %392, align 16
+ %1248 = load <8 x half>, ptr addrspace(3) %395, align 16
+ %1249 = load <8 x half>, ptr addrspace(3) %396, align 16
+ %1250 = insertelement <4 x float> poison, float %527, i64 0
+ %1251 = insertelement <4 x float> %1250, float %528, i64 1
+ %1252 = insertelement <4 x float> %1251, float %529, i64 2
+ %1253 = insertelement <4 x float> %1252, float %530, i64 3
+ %1254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1142, <4 x float> %1253, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1143, <4 x float> %1254, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1256 = insertelement <4 x float> poison, float %531, i64 0
+ %1257 = insertelement <4 x float> %1256, float %532, i64 1
+ %1258 = insertelement <4 x float> %1257, float %533, i64 2
+ %1259 = insertelement <4 x float> %1258, float %534, i64 3
+ %1260 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1142, <4 x float> %1259, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1261 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1143, <4 x float> %1260, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1262 = insertelement <4 x float> poison, float %535, i64 0
+ %1263 = insertelement <4 x float> %1262, float %536, i64 1
+ %1264 = insertelement <4 x float> %1263, float %537, i64 2
+ %1265 = insertelement <4 x float> %1264, float %538, i64 3
+ %1266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1144, <4 x float> %1265, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1145, <4 x float> %1266, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1268 = insertelement <4 x float> poison, float %539, i64 0
+ %1269 = insertelement <4 x float> %1268, float %540, i64 1
+ %1270 = insertelement <4 x float> %1269, float %541, i64 2
+ %1271 = insertelement <4 x float> %1270, float %542, i64 3
+ %1272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1144, <4 x float> %1271, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1145, <4 x float> %1272, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !46
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !47
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !48
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !49
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !50
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !51
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !52
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !53
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !54
+ %1274 = insertelement <4 x float> poison, float %511, i64 0
+ %1275 = insertelement <4 x float> %1274, float %512, i64 1
+ %1276 = insertelement <4 x float> %1275, float %513, i64 2
+ %1277 = insertelement <4 x float> %1276, float %514, i64 3
+ %1278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1142, <4 x float> %1277, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1143, <4 x float> %1278, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1280 = insertelement <4 x float> poison, float %515, i64 0
+ %1281 = insertelement <4 x float> %1280, float %516, i64 1
+ %1282 = insertelement <4 x float> %1281, float %517, i64 2
+ %1283 = insertelement <4 x float> %1282, float %518, i64 3
+ %1284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1142, <4 x float> %1283, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1143, <4 x float> %1284, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1286 = insertelement <4 x float> poison, float %519, i64 0
+ %1287 = insertelement <4 x float> %1286, float %520, i64 1
+ %1288 = insertelement <4 x float> %1287, float %521, i64 2
+ %1289 = insertelement <4 x float> %1288, float %522, i64 3
+ %1290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1144, <4 x float> %1289, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1145, <4 x float> %1290, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1292 = insertelement <4 x float> poison, float %523, i64 0
+ %1293 = insertelement <4 x float> %1292, float %524, i64 1
+ %1294 = insertelement <4 x float> %1293, float %525, i64 2
+ %1295 = insertelement <4 x float> %1294, float %526, i64 3
+ %1296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1144, <4 x float> %1295, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1145, <4 x float> %1296, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !55
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !56
+ %1298 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1299 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1300 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1301 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1302 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1303 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1304 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1305 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1306 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1298, <4 x float> %879, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1307 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1299, <4 x float> %1306, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1308 = extractelement <4 x float> %1307, i64 0
+ %1309 = extractelement <4 x float> %1307, i64 1
+ %1310 = extractelement <4 x float> %1307, i64 2
+ %1311 = extractelement <4 x float> %1307, i64 3
+ %1312 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1298, <4 x float> %885, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1313 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1299, <4 x float> %1312, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1314 = extractelement <4 x float> %1313, i64 0
+ %1315 = extractelement <4 x float> %1313, i64 1
+ %1316 = extractelement <4 x float> %1313, i64 2
+ %1317 = extractelement <4 x float> %1313, i64 3
+ %1318 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1300, <4 x float> %891, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1319 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1301, <4 x float> %1318, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1320 = extractelement <4 x float> %1319, i64 0
+ %1321 = extractelement <4 x float> %1319, i64 1
+ %1322 = extractelement <4 x float> %1319, i64 2
+ %1323 = extractelement <4 x float> %1319, i64 3
+ %1324 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1300, <4 x float> %897, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1325 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1301, <4 x float> %1324, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1326 = extractelement <4 x float> %1325, i64 0
+ %1327 = extractelement <4 x float> %1325, i64 1
+ %1328 = extractelement <4 x float> %1325, i64 2
+ %1329 = extractelement <4 x float> %1325, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !57
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !58
+ %1330 = load <8 x half>, ptr addrspace(3) %399, align 16
+ %1331 = load <8 x half>, ptr addrspace(3) %400, align 16
+ %1332 = load <8 x half>, ptr addrspace(3) %403, align 16
+ %1333 = load <8 x half>, ptr addrspace(3) %404, align 16
+ %1334 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1335 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1336 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1337 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1298, <4 x float> %907, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1299, <4 x float> %1338, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1340 = extractelement <4 x float> %1339, i64 0
+ %1341 = extractelement <4 x float> %1339, i64 1
+ %1342 = extractelement <4 x float> %1339, i64 2
+ %1343 = extractelement <4 x float> %1339, i64 3
+ %1344 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1298, <4 x float> %913, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1345 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1299, <4 x float> %1344, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1346 = extractelement <4 x float> %1345, i64 0
+ %1347 = extractelement <4 x float> %1345, i64 1
+ %1348 = extractelement <4 x float> %1345, i64 2
+ %1349 = extractelement <4 x float> %1345, i64 3
+ %1350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1300, <4 x float> %919, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1301, <4 x float> %1350, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1352 = extractelement <4 x float> %1351, i64 0
+ %1353 = extractelement <4 x float> %1351, i64 1
+ %1354 = extractelement <4 x float> %1351, i64 2
+ %1355 = extractelement <4 x float> %1351, i64 3
+ %1356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1300, <4 x float> %925, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1301, <4 x float> %1356, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1358 = extractelement <4 x float> %1357, i64 0
+ %1359 = extractelement <4 x float> %1357, i64 1
+ %1360 = extractelement <4 x float> %1357, i64 2
+ %1361 = extractelement <4 x float> %1357, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !59
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !60
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !61
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !62
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !63
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !64
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !65
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !66
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !67
+ %1362 = load <8 x half>, ptr addrspace(3) %407, align 16
+ %1363 = load <8 x half>, ptr addrspace(3) %408, align 16
+ %1364 = load <8 x half>, ptr addrspace(3) %411, align 16
+ %1365 = load <8 x half>, ptr addrspace(3) %412, align 16
+ %1366 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1367 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1368 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1369 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1370 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1366, <4 x float> %939, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1371 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1367, <4 x float> %1370, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1372 = extractelement <4 x float> %1371, i64 0
+ %1373 = extractelement <4 x float> %1371, i64 1
+ %1374 = extractelement <4 x float> %1371, i64 2
+ %1375 = extractelement <4 x float> %1371, i64 3
+ %1376 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1366, <4 x float> %945, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1377 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1367, <4 x float> %1376, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1378 = extractelement <4 x float> %1377, i64 0
+ %1379 = extractelement <4 x float> %1377, i64 1
+ %1380 = extractelement <4 x float> %1377, i64 2
+ %1381 = extractelement <4 x float> %1377, i64 3
+ %1382 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1368, <4 x float> %951, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1383 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1369, <4 x float> %1382, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1384 = extractelement <4 x float> %1383, i64 0
+ %1385 = extractelement <4 x float> %1383, i64 1
+ %1386 = extractelement <4 x float> %1383, i64 2
+ %1387 = extractelement <4 x float> %1383, i64 3
+ %1388 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1368, <4 x float> %957, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1389 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1369, <4 x float> %1388, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1390 = extractelement <4 x float> %1389, i64 0
+ %1391 = extractelement <4 x float> %1389, i64 1
+ %1392 = extractelement <4 x float> %1389, i64 2
+ %1393 = extractelement <4 x float> %1389, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !68
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !69
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !70
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !71
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !72
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !73
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !74
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !75
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !76
+ %1394 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1366, <4 x float> %963, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1395 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1367, <4 x float> %1394, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1396 = extractelement <4 x float> %1395, i64 0
+ %1397 = extractelement <4 x float> %1395, i64 1
+ %1398 = extractelement <4 x float> %1395, i64 2
+ %1399 = extractelement <4 x float> %1395, i64 3
+ %1400 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1366, <4 x float> %969, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1401 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1367, <4 x float> %1400, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1402 = extractelement <4 x float> %1401, i64 0
+ %1403 = extractelement <4 x float> %1401, i64 1
+ %1404 = extractelement <4 x float> %1401, i64 2
+ %1405 = extractelement <4 x float> %1401, i64 3
+ %1406 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1368, <4 x float> %975, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1407 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1369, <4 x float> %1406, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1408 = extractelement <4 x float> %1407, i64 0
+ %1409 = extractelement <4 x float> %1407, i64 1
+ %1410 = extractelement <4 x float> %1407, i64 2
+ %1411 = extractelement <4 x float> %1407, i64 3
+ %1412 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1368, <4 x float> %981, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1413 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1369, <4 x float> %1412, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1414 = extractelement <4 x float> %1413, i64 0
+ %1415 = extractelement <4 x float> %1413, i64 1
+ %1416 = extractelement <4 x float> %1413, i64 2
+ %1417 = extractelement <4 x float> %1413, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !77
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !78
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %1418 = shufflevector <2 x half> %865, <2 x half> %864, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1419 = shufflevector <2 x half> %863, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1420 = shufflevector <8 x half> %1418, <8 x half> %1419, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1421 = shufflevector <2 x half> %862, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1422 = shufflevector <8 x half> %1420, <8 x half> %1421, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1422, ptr addrspace(3) %199, align 16
+ %1423 = shufflevector <2 x half> %861, <2 x half> %860, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1424 = shufflevector <2 x half> %859, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1425 = shufflevector <8 x half> %1423, <8 x half> %1424, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1426 = shufflevector <2 x half> %858, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1427 = shufflevector <8 x half> %1425, <8 x half> %1426, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1427, ptr addrspace(3) %201, align 16
+ %1428 = shufflevector <2 x half> %857, <2 x half> %856, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1429 = shufflevector <2 x half> %855, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1430 = shufflevector <8 x half> %1428, <8 x half> %1429, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1431 = shufflevector <2 x half> %854, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1432 = shufflevector <8 x half> %1430, <8 x half> %1431, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1432, ptr addrspace(3) %203, align 16
+ %1433 = shufflevector <2 x half> %853, <2 x half> %852, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1434 = shufflevector <2 x half> %851, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1435 = shufflevector <8 x half> %1433, <8 x half> %1434, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1436 = shufflevector <2 x half> %850, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1437 = shufflevector <8 x half> %1435, <8 x half> %1436, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1437, ptr addrspace(3) %205, align 16
+ %1438 = shufflevector <2 x half> %849, <2 x half> %848, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1439 = shufflevector <2 x half> %847, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1440 = shufflevector <8 x half> %1438, <8 x half> %1439, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1441 = shufflevector <2 x half> %846, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1442 = shufflevector <8 x half> %1440, <8 x half> %1441, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1442, ptr addrspace(3) %207, align 16
+ %1443 = shufflevector <2 x half> %845, <2 x half> %844, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1444 = shufflevector <2 x half> %843, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1445 = shufflevector <8 x half> %1443, <8 x half> %1444, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1446 = shufflevector <2 x half> %842, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1447 = shufflevector <8 x half> %1445, <8 x half> %1446, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1447, ptr addrspace(3) %209, align 16
+ %1448 = shufflevector <2 x half> %841, <2 x half> %840, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1449 = shufflevector <2 x half> %839, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1450 = shufflevector <8 x half> %1448, <8 x half> %1449, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1451 = shufflevector <2 x half> %838, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1452 = shufflevector <8 x half> %1450, <8 x half> %1451, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1452, ptr addrspace(3) %211, align 16
+ %1453 = shufflevector <2 x half> %837, <2 x half> %836, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1454 = shufflevector <2 x half> %835, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1455 = shufflevector <8 x half> %1453, <8 x half> %1454, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1456 = shufflevector <2 x half> %834, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1457 = shufflevector <8 x half> %1455, <8 x half> %1456, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1457, ptr addrspace(3) %213, align 16
+ %1458 = getelementptr i8, ptr addrspace(1) %752, i64 128
+ %1459 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1458, i16 0, i32 2147483646, i32 159744)
+ %1460 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %68, i32 0, i32 0)
+ %1461 = bitcast <4 x i32> %1460 to <8 x half>
+ %1462 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %71, i32 0, i32 0)
+ %1463 = bitcast <4 x i32> %1462 to <8 x half>
+ %1464 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %74, i32 0, i32 0)
+ %1465 = bitcast <4 x i32> %1464 to <8 x half>
+ %1466 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %77, i32 0, i32 0)
+ %1467 = bitcast <4 x i32> %1466 to <8 x half>
+ %1468 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %80, i32 0, i32 0)
+ %1469 = bitcast <4 x i32> %1468 to <8 x half>
+ %1470 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %83, i32 0, i32 0)
+ %1471 = bitcast <4 x i32> %1470 to <8 x half>
+ %1472 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %86, i32 0, i32 0)
+ %1473 = bitcast <4 x i32> %1472 to <8 x half>
+ %1474 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %89, i32 0, i32 0)
+ %1475 = bitcast <4 x i32> %1474 to <8 x half>
+ %1476 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1477 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1478 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1479 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1298, <4 x float> %991, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1299, <4 x float> %1480, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1482 = extractelement <4 x float> %1481, i64 0
+ %1483 = extractelement <4 x float> %1481, i64 1
+ %1484 = extractelement <4 x float> %1481, i64 2
+ %1485 = extractelement <4 x float> %1481, i64 3
+ %1486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1298, <4 x float> %997, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1299, <4 x float> %1486, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1488 = extractelement <4 x float> %1487, i64 0
+ %1489 = extractelement <4 x float> %1487, i64 1
+ %1490 = extractelement <4 x float> %1487, i64 2
+ %1491 = extractelement <4 x float> %1487, i64 3
+ %1492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1300, <4 x float> %1003, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1301, <4 x float> %1492, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1494 = extractelement <4 x float> %1493, i64 0
+ %1495 = extractelement <4 x float> %1493, i64 1
+ %1496 = extractelement <4 x float> %1493, i64 2
+ %1497 = extractelement <4 x float> %1493, i64 3
+ %1498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1300, <4 x float> %1009, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1301, <4 x float> %1498, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1500 = extractelement <4 x float> %1499, i64 0
+ %1501 = extractelement <4 x float> %1499, i64 1
+ %1502 = extractelement <4 x float> %1499, i64 2
+ %1503 = extractelement <4 x float> %1499, i64 3
+ %1504 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1505 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1506 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1507 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1508 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1298, <4 x float> %1019, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1509 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1299, <4 x float> %1508, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1510 = extractelement <4 x float> %1509, i64 0
+ %1511 = extractelement <4 x float> %1509, i64 1
+ %1512 = extractelement <4 x float> %1509, i64 2
+ %1513 = extractelement <4 x float> %1509, i64 3
+ %1514 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1298, <4 x float> %1025, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1515 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1299, <4 x float> %1514, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1516 = extractelement <4 x float> %1515, i64 0
+ %1517 = extractelement <4 x float> %1515, i64 1
+ %1518 = extractelement <4 x float> %1515, i64 2
+ %1519 = extractelement <4 x float> %1515, i64 3
+ %1520 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1300, <4 x float> %1031, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1521 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1301, <4 x float> %1520, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1522 = extractelement <4 x float> %1521, i64 0
+ %1523 = extractelement <4 x float> %1521, i64 1
+ %1524 = extractelement <4 x float> %1521, i64 2
+ %1525 = extractelement <4 x float> %1521, i64 3
+ %1526 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1300, <4 x float> %1037, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1527 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1301, <4 x float> %1526, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1528 = extractelement <4 x float> %1527, i64 0
+ %1529 = extractelement <4 x float> %1527, i64 1
+ %1530 = extractelement <4 x float> %1527, i64 2
+ %1531 = extractelement <4 x float> %1527, i64 3
+ %1532 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1366, <4 x float> %1047, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1533 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1367, <4 x float> %1532, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1534 = extractelement <4 x float> %1533, i64 0
+ %1535 = extractelement <4 x float> %1533, i64 1
+ %1536 = extractelement <4 x float> %1533, i64 2
+ %1537 = extractelement <4 x float> %1533, i64 3
+ %1538 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1366, <4 x float> %1053, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1539 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1367, <4 x float> %1538, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1540 = extractelement <4 x float> %1539, i64 0
+ %1541 = extractelement <4 x float> %1539, i64 1
+ %1542 = extractelement <4 x float> %1539, i64 2
+ %1543 = extractelement <4 x float> %1539, i64 3
+ %1544 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1368, <4 x float> %1059, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1545 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1369, <4 x float> %1544, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1546 = extractelement <4 x float> %1545, i64 0
+ %1547 = extractelement <4 x float> %1545, i64 1
+ %1548 = extractelement <4 x float> %1545, i64 2
+ %1549 = extractelement <4 x float> %1545, i64 3
+ %1550 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1368, <4 x float> %1065, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1551 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1369, <4 x float> %1550, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1552 = extractelement <4 x float> %1551, i64 0
+ %1553 = extractelement <4 x float> %1551, i64 1
+ %1554 = extractelement <4 x float> %1551, i64 2
+ %1555 = extractelement <4 x float> %1551, i64 3
+ %1556 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1366, <4 x float> %1071, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1557 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1367, <4 x float> %1556, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1558 = extractelement <4 x float> %1557, i64 0
+ %1559 = extractelement <4 x float> %1557, i64 1
+ %1560 = extractelement <4 x float> %1557, i64 2
+ %1561 = extractelement <4 x float> %1557, i64 3
+ %1562 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1366, <4 x float> %1077, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1563 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1367, <4 x float> %1562, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1564 = extractelement <4 x float> %1563, i64 0
+ %1565 = extractelement <4 x float> %1563, i64 1
+ %1566 = extractelement <4 x float> %1563, i64 2
+ %1567 = extractelement <4 x float> %1563, i64 3
+ %1568 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1368, <4 x float> %1083, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1569 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1369, <4 x float> %1568, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1570 = extractelement <4 x float> %1569, i64 0
+ %1571 = extractelement <4 x float> %1569, i64 1
+ %1572 = extractelement <4 x float> %1569, i64 2
+ %1573 = extractelement <4 x float> %1569, i64 3
+ %1574 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1368, <4 x float> %1089, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1575 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1369, <4 x float> %1574, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1576 = extractelement <4 x float> %1575, i64 0
+ %1577 = extractelement <4 x float> %1575, i64 1
+ %1578 = extractelement <4 x float> %1575, i64 2
+ %1579 = extractelement <4 x float> %1575, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !79
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !80
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !81
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !82
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !83
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !84
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !85
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !86
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !87
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !88
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !89
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !90
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !91
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !92
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !93
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !94
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !95
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !96
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !97
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !98
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !99
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !100
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !101
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !102
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !103
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !104
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !105
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !106
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !107
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !108
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !109
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !110
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030), !dbg !111
+ %1580 = shufflevector <2 x half> %833, <2 x half> %832, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1581 = shufflevector <2 x half> %831, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1582 = shufflevector <8 x half> %1580, <8 x half> %1581, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1583 = shufflevector <2 x half> %830, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1584 = shufflevector <8 x half> %1582, <8 x half> %1583, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1584, ptr addrspace(3) %214, align 16
+ %1585 = shufflevector <2 x half> %829, <2 x half> %828, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1586 = shufflevector <2 x half> %827, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1587 = shufflevector <8 x half> %1585, <8 x half> %1586, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1588 = shufflevector <2 x half> %826, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1589 = shufflevector <8 x half> %1587, <8 x half> %1588, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1589, ptr addrspace(3) %215, align 16
+ %1590 = shufflevector <2 x half> %825, <2 x half> %824, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1591 = shufflevector <2 x half> %823, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1592 = shufflevector <8 x half> %1590, <8 x half> %1591, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1593 = shufflevector <2 x half> %822, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1594 = shufflevector <8 x half> %1592, <8 x half> %1593, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1594, ptr addrspace(3) %216, align 16
+ %1595 = shufflevector <2 x half> %821, <2 x half> %820, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1596 = shufflevector <2 x half> %819, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1597 = shufflevector <8 x half> %1595, <8 x half> %1596, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1598 = shufflevector <2 x half> %818, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1599 = shufflevector <8 x half> %1597, <8 x half> %1598, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1599, ptr addrspace(3) %217, align 16
+ %1600 = shufflevector <2 x half> %817, <2 x half> %816, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1601 = shufflevector <2 x half> %815, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1602 = shufflevector <8 x half> %1600, <8 x half> %1601, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1603 = shufflevector <2 x half> %814, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1604 = shufflevector <8 x half> %1602, <8 x half> %1603, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1604, ptr addrspace(3) %218, align 16
+ %1605 = shufflevector <2 x half> %813, <2 x half> %812, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1606 = shufflevector <2 x half> %811, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1607 = shufflevector <8 x half> %1605, <8 x half> %1606, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1608 = shufflevector <2 x half> %810, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1609 = shufflevector <8 x half> %1607, <8 x half> %1608, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1609, ptr addrspace(3) %219, align 16
+ %1610 = shufflevector <2 x half> %809, <2 x half> %808, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1611 = shufflevector <2 x half> %807, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1612 = shufflevector <8 x half> %1610, <8 x half> %1611, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1613 = shufflevector <2 x half> %806, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1614 = shufflevector <8 x half> %1612, <8 x half> %1613, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1614, ptr addrspace(3) %220, align 16
+ %1615 = shufflevector <2 x half> %805, <2 x half> %804, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1616 = shufflevector <2 x half> %803, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1617 = shufflevector <8 x half> %1615, <8 x half> %1616, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1618 = shufflevector <2 x half> %802, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1619 = shufflevector <8 x half> %1617, <8 x half> %1618, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1619, ptr addrspace(3) %221, align 16
+ %1620 = getelementptr i8, ptr addrspace(1) %751, i64 128
+ %1621 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1620, i16 0, i32 2147483646, i32 159744)
+ %1622 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %140, i32 0, i32 0)
+ %1623 = bitcast <4 x i32> %1622 to <8 x half>
+ %1624 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %143, i32 0, i32 0)
+ %1625 = bitcast <4 x i32> %1624 to <8 x half>
+ %1626 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %146, i32 0, i32 0)
+ %1627 = bitcast <4 x i32> %1626 to <8 x half>
+ %1628 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %149, i32 0, i32 0)
+ %1629 = bitcast <4 x i32> %1628 to <8 x half>
+ %1630 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %152, i32 0, i32 0)
+ %1631 = bitcast <4 x i32> %1630 to <8 x half>
+ %1632 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %155, i32 0, i32 0)
+ %1633 = bitcast <4 x i32> %1632 to <8 x half>
+ %1634 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %158, i32 0, i32 0)
+ %1635 = bitcast <4 x i32> %1634 to <8 x half>
+ %1636 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %161, i32 0, i32 0)
+ %1637 = bitcast <4 x i32> %1636 to <8 x half>
+ %1638 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1639 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1640 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1641 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1642 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1638, <4 x float> %1099, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1643 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1639, <4 x float> %1642, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1644 = extractelement <4 x float> %1643, i64 0
+ %1645 = extractelement <4 x float> %1643, i64 1
+ %1646 = extractelement <4 x float> %1643, i64 2
+ %1647 = extractelement <4 x float> %1643, i64 3
+ %1648 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1638, <4 x float> %1105, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1649 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1639, <4 x float> %1648, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1650 = extractelement <4 x float> %1649, i64 0
+ %1651 = extractelement <4 x float> %1649, i64 1
+ %1652 = extractelement <4 x float> %1649, i64 2
+ %1653 = extractelement <4 x float> %1649, i64 3
+ %1654 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1640, <4 x float> %1111, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1655 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1641, <4 x float> %1654, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1656 = extractelement <4 x float> %1655, i64 0
+ %1657 = extractelement <4 x float> %1655, i64 1
+ %1658 = extractelement <4 x float> %1655, i64 2
+ %1659 = extractelement <4 x float> %1655, i64 3
+ %1660 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1640, <4 x float> %1117, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1661 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1641, <4 x float> %1660, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1662 = extractelement <4 x float> %1661, i64 0
+ %1663 = extractelement <4 x float> %1661, i64 1
+ %1664 = extractelement <4 x float> %1661, i64 2
+ %1665 = extractelement <4 x float> %1661, i64 3
+ %1666 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1638, <4 x float> %1123, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1667 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1639, <4 x float> %1666, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1668 = extractelement <4 x float> %1667, i64 0
+ %1669 = extractelement <4 x float> %1667, i64 1
+ %1670 = extractelement <4 x float> %1667, i64 2
+ %1671 = extractelement <4 x float> %1667, i64 3
+ %1672 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1638, <4 x float> %1129, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1673 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1639, <4 x float> %1672, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1674 = extractelement <4 x float> %1673, i64 0
+ %1675 = extractelement <4 x float> %1673, i64 1
+ %1676 = extractelement <4 x float> %1673, i64 2
+ %1677 = extractelement <4 x float> %1673, i64 3
+ %1678 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1640, <4 x float> %1135, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1679 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1641, <4 x float> %1678, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1680 = extractelement <4 x float> %1679, i64 0
+ %1681 = extractelement <4 x float> %1679, i64 1
+ %1682 = extractelement <4 x float> %1679, i64 2
+ %1683 = extractelement <4 x float> %1679, i64 3
+ %1684 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1640, <4 x float> %1141, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1685 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1641, <4 x float> %1684, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1686 = extractelement <4 x float> %1685, i64 0
+ %1687 = extractelement <4 x float> %1685, i64 1
+ %1688 = extractelement <4 x float> %1685, i64 2
+ %1689 = extractelement <4 x float> %1685, i64 3
+ %1690 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1691 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1692 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1693 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1690, <4 x float> %1151, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1691, <4 x float> %1694, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1696 = extractelement <4 x float> %1695, i64 0
+ %1697 = extractelement <4 x float> %1695, i64 1
+ %1698 = extractelement <4 x float> %1695, i64 2
+ %1699 = extractelement <4 x float> %1695, i64 3
+ %1700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1690, <4 x float> %1157, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1691, <4 x float> %1700, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1702 = extractelement <4 x float> %1701, i64 0
+ %1703 = extractelement <4 x float> %1701, i64 1
+ %1704 = extractelement <4 x float> %1701, i64 2
+ %1705 = extractelement <4 x float> %1701, i64 3
+ %1706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1692, <4 x float> %1163, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1693, <4 x float> %1706, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1708 = extractelement <4 x float> %1707, i64 0
+ %1709 = extractelement <4 x float> %1707, i64 1
+ %1710 = extractelement <4 x float> %1707, i64 2
+ %1711 = extractelement <4 x float> %1707, i64 3
+ %1712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1692, <4 x float> %1169, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1693, <4 x float> %1712, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1714 = extractelement <4 x float> %1713, i64 0
+ %1715 = extractelement <4 x float> %1713, i64 1
+ %1716 = extractelement <4 x float> %1713, i64 2
+ %1717 = extractelement <4 x float> %1713, i64 3
+ %1718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1690, <4 x float> %1175, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1691, <4 x float> %1718, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1720 = extractelement <4 x float> %1719, i64 0
+ %1721 = extractelement <4 x float> %1719, i64 1
+ %1722 = extractelement <4 x float> %1719, i64 2
+ %1723 = extractelement <4 x float> %1719, i64 3
+ %1724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1690, <4 x float> %1181, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1691, <4 x float> %1724, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1726 = extractelement <4 x float> %1725, i64 0
+ %1727 = extractelement <4 x float> %1725, i64 1
+ %1728 = extractelement <4 x float> %1725, i64 2
+ %1729 = extractelement <4 x float> %1725, i64 3
+ %1730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1692, <4 x float> %1187, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1693, <4 x float> %1730, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1732 = extractelement <4 x float> %1731, i64 0
+ %1733 = extractelement <4 x float> %1731, i64 1
+ %1734 = extractelement <4 x float> %1731, i64 2
+ %1735 = extractelement <4 x float> %1731, i64 3
+ %1736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1692, <4 x float> %1193, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1693, <4 x float> %1736, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1738 = extractelement <4 x float> %1737, i64 0
+ %1739 = extractelement <4 x float> %1737, i64 1
+ %1740 = extractelement <4 x float> %1737, i64 2
+ %1741 = extractelement <4 x float> %1737, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !112
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !113
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !114
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !115
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !116
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !117
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !118
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !119
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !120
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !121
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !122
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !123
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !124
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !125
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !126
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !127
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !128
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !129
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !130
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !131
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !132
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !133
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !134
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !135
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !136
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !137
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !138
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !139
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !140
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0), !dbg !141
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !142
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0), !dbg !143
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !144
+ %1742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1638, <4 x float> %1199, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1639, <4 x float> %1742, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1744 = extractelement <4 x float> %1743, i64 0
+ %1745 = extractelement <4 x float> %1743, i64 1
+ %1746 = extractelement <4 x float> %1743, i64 2
+ %1747 = extractelement <4 x float> %1743, i64 3
+ %1748 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1638, <4 x float> %1205, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1749 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1639, <4 x float> %1748, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1750 = extractelement <4 x float> %1749, i64 0
+ %1751 = extractelement <4 x float> %1749, i64 1
+ %1752 = extractelement <4 x float> %1749, i64 2
+ %1753 = extractelement <4 x float> %1749, i64 3
+ %1754 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1640, <4 x float> %1211, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1755 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1641, <4 x float> %1754, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1756 = extractelement <4 x float> %1755, i64 0
+ %1757 = extractelement <4 x float> %1755, i64 1
+ %1758 = extractelement <4 x float> %1755, i64 2
+ %1759 = extractelement <4 x float> %1755, i64 3
+ %1760 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1640, <4 x float> %1217, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1641, <4 x float> %1760, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1762 = extractelement <4 x float> %1761, i64 0
+ %1763 = extractelement <4 x float> %1761, i64 1
+ %1764 = extractelement <4 x float> %1761, i64 2
+ %1765 = extractelement <4 x float> %1761, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !145
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !146
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %1766 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %1767 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %1768 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %1769 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %1770 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1638, <4 x float> %1227, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1771 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1639, <4 x float> %1770, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1772 = extractelement <4 x float> %1771, i64 0
+ %1773 = extractelement <4 x float> %1771, i64 1
+ %1774 = extractelement <4 x float> %1771, i64 2
+ %1775 = extractelement <4 x float> %1771, i64 3
+ %1776 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1638, <4 x float> %1233, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1777 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1639, <4 x float> %1776, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1778 = extractelement <4 x float> %1777, i64 0
+ %1779 = extractelement <4 x float> %1777, i64 1
+ %1780 = extractelement <4 x float> %1777, i64 2
+ %1781 = extractelement <4 x float> %1777, i64 3
+ %1782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1640, <4 x float> %1239, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1641, <4 x float> %1782, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1784 = extractelement <4 x float> %1783, i64 0
+ %1785 = extractelement <4 x float> %1783, i64 1
+ %1786 = extractelement <4 x float> %1783, i64 2
+ %1787 = extractelement <4 x float> %1783, i64 3
+ %1788 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1640, <4 x float> %1245, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1789 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1641, <4 x float> %1788, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1790 = extractelement <4 x float> %1789, i64 0
+ %1791 = extractelement <4 x float> %1789, i64 1
+ %1792 = extractelement <4 x float> %1789, i64 2
+ %1793 = extractelement <4 x float> %1789, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !147
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !148
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !149
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !150
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !151
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !152
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !153
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !154
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !155
+ %1794 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %1795 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %1796 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %1797 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %1798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1690, <4 x float> %1255, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1691, <4 x float> %1798, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1800 = extractelement <4 x float> %1799, i64 0
+ %1801 = extractelement <4 x float> %1799, i64 1
+ %1802 = extractelement <4 x float> %1799, i64 2
+ %1803 = extractelement <4 x float> %1799, i64 3
+ %1804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1690, <4 x float> %1261, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1691, <4 x float> %1804, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1806 = extractelement <4 x float> %1805, i64 0
+ %1807 = extractelement <4 x float> %1805, i64 1
+ %1808 = extractelement <4 x float> %1805, i64 2
+ %1809 = extractelement <4 x float> %1805, i64 3
+ %1810 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1692, <4 x float> %1267, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1811 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1693, <4 x float> %1810, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1812 = extractelement <4 x float> %1811, i64 0
+ %1813 = extractelement <4 x float> %1811, i64 1
+ %1814 = extractelement <4 x float> %1811, i64 2
+ %1815 = extractelement <4 x float> %1811, i64 3
+ %1816 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1692, <4 x float> %1273, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1817 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1693, <4 x float> %1816, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1818 = extractelement <4 x float> %1817, i64 0
+ %1819 = extractelement <4 x float> %1817, i64 1
+ %1820 = extractelement <4 x float> %1817, i64 2
+ %1821 = extractelement <4 x float> %1817, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !156
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !157
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !158
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !159
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !160
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !161
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 2, i32 0), !dbg !162
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0), !dbg !163
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !164
+ %1822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1690, <4 x float> %1279, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1691, <4 x float> %1822, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1824 = extractelement <4 x float> %1823, i64 0
+ %1825 = extractelement <4 x float> %1823, i64 1
+ %1826 = extractelement <4 x float> %1823, i64 2
+ %1827 = extractelement <4 x float> %1823, i64 3
+ %1828 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1690, <4 x float> %1285, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1829 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1691, <4 x float> %1828, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1830 = extractelement <4 x float> %1829, i64 0
+ %1831 = extractelement <4 x float> %1829, i64 1
+ %1832 = extractelement <4 x float> %1829, i64 2
+ %1833 = extractelement <4 x float> %1829, i64 3
+ %1834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1692, <4 x float> %1291, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1693, <4 x float> %1834, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1836 = extractelement <4 x float> %1835, i64 0
+ %1837 = extractelement <4 x float> %1835, i64 1
+ %1838 = extractelement <4 x float> %1835, i64 2
+ %1839 = extractelement <4 x float> %1835, i64 3
+ %1840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1692, <4 x float> %1297, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1693, <4 x float> %1840, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %1842 = extractelement <4 x float> %1841, i64 0
+ %1843 = extractelement <4 x float> %1841, i64 1
+ %1844 = extractelement <4 x float> %1841, i64 2
+ %1845 = extractelement <4 x float> %1841, i64 3
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 8, i32 0), !dbg !165
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !166
+ %1846 = add nuw nsw i32 %769, 1
+ %exitcond.not = icmp eq i32 %769, %413
+ %1847 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1848 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1849 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1850 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1851 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1852 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1853 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1854 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1855 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1856 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1857 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1858 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1859 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1860 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1861 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1862 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1863 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1864 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1865 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1866 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1867 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1868 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1869 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1870 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1871 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1872 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1873 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1874 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1875 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1876 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1877 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1878 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1879 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1880 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1881 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1882 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1883 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1884 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1885 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1886 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1887 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1888 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1889 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1890 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1891 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1892 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1893 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1894 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1895 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1896 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1897 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1898 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1899 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1900 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1901 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1902 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1903 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1904 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1905 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1906 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1907 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1908 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1909 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1910 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1911 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1912 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1913 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1914 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1915 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1916 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1917 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1918 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1919 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1920 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1921 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1922 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1923 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1924 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1925 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1926 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1927 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1928 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1929 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1930 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1931 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1932 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1933 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1934 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1935 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1936 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1937 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1938 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1939 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1940 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1941 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1942 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ br i1 %exitcond.not, label %._crit_edge, label %510
+
+._crit_edge: ; preds = %510, %.._crit_edge_crit_edge
+ %.pre-phi1068 = phi i32 [ %.pre1067, %.._crit_edge_crit_edge ], [ %410, %510 ]
+ %.pre-phi1066 = phi i32 [ %.pre1065, %.._crit_edge_crit_edge ], [ %409, %510 ]
+ %.pre-phi1064 = phi i32 [ %.pre1063, %.._crit_edge_crit_edge ], [ %406, %510 ]
+ %.pre-phi1062 = phi i32 [ %.pre1061, %.._crit_edge_crit_edge ], [ %405, %510 ]
+ %.pre-phi1060 = phi i32 [ %.pre1059, %.._crit_edge_crit_edge ], [ %402, %510 ]
+ %.pre-phi1058 = phi i32 [ %.pre1057, %.._crit_edge_crit_edge ], [ %401, %510 ]
+ %.pre-phi1056 = phi i32 [ %.pre1055, %.._crit_edge_crit_edge ], [ %398, %510 ]
+ %.pre-phi1054 = phi i32 [ %.pre1053, %.._crit_edge_crit_edge ], [ %397, %510 ]
+ %.pre-phi1052 = phi i32 [ %.pre1051, %.._crit_edge_crit_edge ], [ %394, %510 ]
+ %.pre-phi1050 = phi i32 [ %.pre1049, %.._crit_edge_crit_edge ], [ %393, %510 ]
+ %.pre-phi1048 = phi i32 [ %.pre1047, %.._crit_edge_crit_edge ], [ %390, %510 ]
+ %.pre-phi1046 = phi i32 [ %.pre1045, %.._crit_edge_crit_edge ], [ %384, %510 ]
+ %.pre-phi1044 = phi i32 [ %.pre1043, %.._crit_edge_crit_edge ], [ %383, %510 ]
+ %.pre-phi1042 = phi i32 [ %.pre1041, %.._crit_edge_crit_edge ], [ %387, %510 ]
+ %.pre-phi1034 = phi i32 [ %.pre1033, %.._crit_edge_crit_edge ], [ %377, %510 ]
+ %.pre-phi1030 = phi i32 [ %.pre1029, %.._crit_edge_crit_edge ], [ %375, %510 ]
+ %.pre-phi1026 = phi i32 [ %.pre1025, %.._crit_edge_crit_edge ], [ %371, %510 ]
+ %.pre-phi1022 = phi i32 [ %.pre1021, %.._crit_edge_crit_edge ], [ %369, %510 ]
+ %.pre-phi1018 = phi i32 [ %.pre1017, %.._crit_edge_crit_edge ], [ %365, %510 ]
+ %.pre-phi1014 = phi i32 [ %.pre1013, %.._crit_edge_crit_edge ], [ %361, %510 ]
+ %1943 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1308, %510 ], !dbg !167
+ %1944 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1309, %510 ], !dbg !167
+ %1945 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1310, %510 ], !dbg !167
+ %1946 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1311, %510 ], !dbg !167
+ %1947 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1314, %510 ], !dbg !167
+ %1948 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1315, %510 ], !dbg !167
+ %1949 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1316, %510 ], !dbg !167
+ %1950 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1317, %510 ], !dbg !167
+ %1951 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1320, %510 ], !dbg !167
+ %1952 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1321, %510 ], !dbg !167
+ %1953 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1322, %510 ], !dbg !167
+ %1954 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1323, %510 ], !dbg !167
+ %1955 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1326, %510 ], !dbg !167
+ %1956 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1327, %510 ], !dbg !167
+ %1957 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1328, %510 ], !dbg !167
+ %1958 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1329, %510 ], !dbg !167
+ %1959 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1340, %510 ], !dbg !168
+ %1960 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1341, %510 ], !dbg !168
+ %1961 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1342, %510 ], !dbg !168
+ %1962 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1343, %510 ], !dbg !168
+ %1963 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1346, %510 ], !dbg !168
+ %1964 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1347, %510 ], !dbg !168
+ %1965 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1348, %510 ], !dbg !168
+ %1966 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1349, %510 ], !dbg !168
+ %1967 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1352, %510 ], !dbg !168
+ %1968 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1353, %510 ], !dbg !168
+ %1969 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1354, %510 ], !dbg !168
+ %1970 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1355, %510 ], !dbg !168
+ %1971 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1358, %510 ], !dbg !168
+ %1972 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1359, %510 ], !dbg !168
+ %1973 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1360, %510 ], !dbg !168
+ %1974 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1361, %510 ], !dbg !168
+ %1975 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1482, %510 ], !dbg !169
+ %1976 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1483, %510 ], !dbg !169
+ %1977 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1484, %510 ], !dbg !169
+ %1978 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1485, %510 ], !dbg !169
+ %1979 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1488, %510 ], !dbg !169
+ %1980 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1489, %510 ], !dbg !169
+ %1981 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1490, %510 ], !dbg !169
+ %1982 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1491, %510 ], !dbg !169
+ %1983 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1494, %510 ], !dbg !169
+ %1984 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1495, %510 ], !dbg !169
+ %1985 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1496, %510 ], !dbg !169
+ %1986 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1497, %510 ], !dbg !169
+ %1987 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1500, %510 ], !dbg !169
+ %1988 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1501, %510 ], !dbg !169
+ %1989 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1502, %510 ], !dbg !169
+ %1990 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1503, %510 ], !dbg !169
+ %1991 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1510, %510 ], !dbg !170
+ %1992 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1511, %510 ], !dbg !170
+ %1993 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1512, %510 ], !dbg !170
+ %1994 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1513, %510 ], !dbg !170
+ %1995 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1516, %510 ], !dbg !170
+ %1996 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1517, %510 ], !dbg !170
+ %1997 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1518, %510 ], !dbg !170
+ %1998 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1519, %510 ], !dbg !170
+ %1999 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1522, %510 ], !dbg !170
+ %2000 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1523, %510 ], !dbg !170
+ %2001 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1524, %510 ], !dbg !170
+ %2002 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1525, %510 ], !dbg !170
+ %2003 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1528, %510 ], !dbg !170
+ %2004 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1529, %510 ], !dbg !170
+ %2005 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1530, %510 ], !dbg !170
+ %2006 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1531, %510 ], !dbg !170
+ %2007 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1372, %510 ], !dbg !171
+ %2008 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1373, %510 ], !dbg !171
+ %2009 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1374, %510 ], !dbg !171
+ %2010 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1375, %510 ], !dbg !171
+ %2011 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1378, %510 ], !dbg !171
+ %2012 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1379, %510 ], !dbg !171
+ %2013 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1380, %510 ], !dbg !171
+ %2014 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1381, %510 ], !dbg !171
+ %2015 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1384, %510 ], !dbg !171
+ %2016 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1385, %510 ], !dbg !171
+ %2017 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1386, %510 ], !dbg !171
+ %2018 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1387, %510 ], !dbg !171
+ %2019 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1390, %510 ], !dbg !171
+ %2020 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1391, %510 ], !dbg !171
+ %2021 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1392, %510 ], !dbg !171
+ %2022 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1393, %510 ], !dbg !171
+ %2023 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1396, %510 ], !dbg !172
+ %2024 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1397, %510 ], !dbg !172
+ %2025 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1398, %510 ], !dbg !172
+ %2026 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1399, %510 ], !dbg !172
+ %2027 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1402, %510 ], !dbg !172
+ %2028 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1403, %510 ], !dbg !172
+ %2029 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1404, %510 ], !dbg !172
+ %2030 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1405, %510 ], !dbg !172
+ %2031 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1408, %510 ], !dbg !172
+ %2032 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1409, %510 ], !dbg !172
+ %2033 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1410, %510 ], !dbg !172
+ %2034 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1411, %510 ], !dbg !172
+ %2035 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1414, %510 ], !dbg !172
+ %2036 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1415, %510 ], !dbg !172
+ %2037 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1416, %510 ], !dbg !172
+ %2038 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1417, %510 ], !dbg !172
+ %2039 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1534, %510 ], !dbg !173
+ %2040 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1535, %510 ], !dbg !173
+ %2041 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1536, %510 ], !dbg !173
+ %2042 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1537, %510 ], !dbg !173
+ %2043 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1540, %510 ], !dbg !173
+ %2044 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1541, %510 ], !dbg !173
+ %2045 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1542, %510 ], !dbg !173
+ %2046 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1543, %510 ], !dbg !173
+ %2047 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1546, %510 ], !dbg !173
+ %2048 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1547, %510 ], !dbg !173
+ %2049 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1548, %510 ], !dbg !173
+ %2050 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1549, %510 ], !dbg !173
+ %2051 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1552, %510 ], !dbg !173
+ %2052 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1553, %510 ], !dbg !173
+ %2053 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1554, %510 ], !dbg !173
+ %2054 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1555, %510 ], !dbg !173
+ %2055 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1558, %510 ], !dbg !174
+ %2056 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1559, %510 ], !dbg !174
+ %2057 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1560, %510 ], !dbg !174
+ %2058 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1561, %510 ], !dbg !174
+ %2059 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1564, %510 ], !dbg !174
+ %2060 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1565, %510 ], !dbg !174
+ %2061 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1566, %510 ], !dbg !174
+ %2062 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1567, %510 ], !dbg !174
+ %2063 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1570, %510 ], !dbg !174
+ %2064 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1571, %510 ], !dbg !174
+ %2065 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1572, %510 ], !dbg !174
+ %2066 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1573, %510 ], !dbg !174
+ %2067 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1576, %510 ], !dbg !174
+ %2068 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1577, %510 ], !dbg !174
+ %2069 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1578, %510 ], !dbg !174
+ %2070 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1579, %510 ], !dbg !174
+ %2071 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1644, %510 ], !dbg !175
+ %2072 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1645, %510 ], !dbg !175
+ %2073 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1646, %510 ], !dbg !175
+ %2074 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1647, %510 ], !dbg !175
+ %2075 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1650, %510 ], !dbg !175
+ %2076 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1651, %510 ], !dbg !175
+ %2077 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1652, %510 ], !dbg !175
+ %2078 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1653, %510 ], !dbg !175
+ %2079 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1656, %510 ], !dbg !175
+ %2080 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1657, %510 ], !dbg !175
+ %2081 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1658, %510 ], !dbg !175
+ %2082 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1659, %510 ], !dbg !175
+ %2083 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1662, %510 ], !dbg !175
+ %2084 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1663, %510 ], !dbg !175
+ %2085 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1664, %510 ], !dbg !175
+ %2086 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1665, %510 ], !dbg !175
+ %2087 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1668, %510 ], !dbg !176
+ %2088 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1669, %510 ], !dbg !176
+ %2089 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1670, %510 ], !dbg !176
+ %2090 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1671, %510 ], !dbg !176
+ %2091 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1674, %510 ], !dbg !176
+ %2092 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1675, %510 ], !dbg !176
+ %2093 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1676, %510 ], !dbg !176
+ %2094 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1677, %510 ], !dbg !176
+ %2095 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1680, %510 ], !dbg !176
+ %2096 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1681, %510 ], !dbg !176
+ %2097 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1682, %510 ], !dbg !176
+ %2098 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1683, %510 ], !dbg !176
+ %2099 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1686, %510 ], !dbg !176
+ %2100 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1687, %510 ], !dbg !176
+ %2101 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1688, %510 ], !dbg !176
+ %2102 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1689, %510 ], !dbg !176
+ %2103 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1744, %510 ], !dbg !177
+ %2104 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1745, %510 ], !dbg !177
+ %2105 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1746, %510 ], !dbg !177
+ %2106 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1747, %510 ], !dbg !177
+ %2107 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1750, %510 ], !dbg !177
+ %2108 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1751, %510 ], !dbg !177
+ %2109 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1752, %510 ], !dbg !177
+ %2110 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1753, %510 ], !dbg !177
+ %2111 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1756, %510 ], !dbg !177
+ %2112 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1757, %510 ], !dbg !177
+ %2113 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1758, %510 ], !dbg !177
+ %2114 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1759, %510 ], !dbg !177
+ %2115 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1762, %510 ], !dbg !177
+ %2116 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1763, %510 ], !dbg !177
+ %2117 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1764, %510 ], !dbg !177
+ %2118 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1765, %510 ], !dbg !177
+ %2119 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1772, %510 ], !dbg !178
+ %2120 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1773, %510 ], !dbg !178
+ %2121 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1774, %510 ], !dbg !178
+ %2122 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1775, %510 ], !dbg !178
+ %2123 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1778, %510 ], !dbg !178
+ %2124 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1779, %510 ], !dbg !178
+ %2125 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1780, %510 ], !dbg !178
+ %2126 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1781, %510 ], !dbg !178
+ %2127 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1784, %510 ], !dbg !178
+ %2128 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1785, %510 ], !dbg !178
+ %2129 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1786, %510 ], !dbg !178
+ %2130 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1787, %510 ], !dbg !178
+ %2131 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1790, %510 ], !dbg !178
+ %2132 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1791, %510 ], !dbg !178
+ %2133 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1792, %510 ], !dbg !178
+ %2134 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1793, %510 ], !dbg !178
+ %2135 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1696, %510 ], !dbg !179
+ %2136 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1697, %510 ], !dbg !179
+ %2137 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1698, %510 ], !dbg !179
+ %2138 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1699, %510 ], !dbg !179
+ %2139 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1702, %510 ], !dbg !179
+ %2140 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1703, %510 ], !dbg !179
+ %2141 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1704, %510 ], !dbg !179
+ %2142 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1705, %510 ], !dbg !179
+ %2143 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1708, %510 ], !dbg !179
+ %2144 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1709, %510 ], !dbg !179
+ %2145 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1710, %510 ], !dbg !179
+ %2146 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1711, %510 ], !dbg !179
+ %2147 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1714, %510 ], !dbg !179
+ %2148 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1715, %510 ], !dbg !179
+ %2149 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1716, %510 ], !dbg !179
+ %2150 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1717, %510 ], !dbg !179
+ %2151 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1720, %510 ], !dbg !180
+ %2152 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1721, %510 ], !dbg !180
+ %2153 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1722, %510 ], !dbg !180
+ %2154 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1723, %510 ], !dbg !180
+ %2155 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1726, %510 ], !dbg !180
+ %2156 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1727, %510 ], !dbg !180
+ %2157 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1728, %510 ], !dbg !180
+ %2158 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1729, %510 ], !dbg !180
+ %2159 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1732, %510 ], !dbg !180
+ %2160 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1733, %510 ], !dbg !180
+ %2161 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1734, %510 ], !dbg !180
+ %2162 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1735, %510 ], !dbg !180
+ %2163 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1738, %510 ], !dbg !180
+ %2164 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1739, %510 ], !dbg !180
+ %2165 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1740, %510 ], !dbg !180
+ %2166 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1741, %510 ], !dbg !180
+ %2167 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1800, %510 ], !dbg !181
+ %2168 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1801, %510 ], !dbg !181
+ %2169 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1802, %510 ], !dbg !181
+ %2170 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1803, %510 ], !dbg !181
+ %2171 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1806, %510 ], !dbg !181
+ %2172 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1807, %510 ], !dbg !181
+ %2173 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1808, %510 ], !dbg !181
+ %2174 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1809, %510 ], !dbg !181
+ %2175 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1812, %510 ], !dbg !181
+ %2176 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1813, %510 ], !dbg !181
+ %2177 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1814, %510 ], !dbg !181
+ %2178 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1815, %510 ], !dbg !181
+ %2179 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1818, %510 ], !dbg !181
+ %2180 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1819, %510 ], !dbg !181
+ %2181 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1820, %510 ], !dbg !181
+ %2182 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1821, %510 ], !dbg !181
+ %2183 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1824, %510 ], !dbg !182
+ %2184 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1825, %510 ], !dbg !182
+ %2185 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1826, %510 ], !dbg !182
+ %2186 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1827, %510 ], !dbg !182
+ %2187 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1830, %510 ], !dbg !182
+ %2188 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1831, %510 ], !dbg !182
+ %2189 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1832, %510 ], !dbg !182
+ %2190 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1833, %510 ], !dbg !182
+ %2191 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1836, %510 ], !dbg !182
+ %2192 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1837, %510 ], !dbg !182
+ %2193 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1838, %510 ], !dbg !182
+ %2194 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1839, %510 ], !dbg !182
+ %2195 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1842, %510 ], !dbg !182
+ %2196 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1843, %510 ], !dbg !182
+ %2197 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1844, %510 ], !dbg !182
+ %2198 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1845, %510 ], !dbg !182
+ %2199 = phi <2 x half> [ %263, %.._crit_edge_crit_edge ], [ %1847, %510 ]
+ %2200 = phi <2 x half> [ %264, %.._crit_edge_crit_edge ], [ %1848, %510 ]
+ %2201 = phi <2 x half> [ %265, %.._crit_edge_crit_edge ], [ %1849, %510 ]
+ %2202 = phi <2 x half> [ %266, %.._crit_edge_crit_edge ], [ %1850, %510 ]
+ %2203 = phi <2 x half> [ %267, %.._crit_edge_crit_edge ], [ %1851, %510 ]
+ %2204 = phi <2 x half> [ %268, %.._crit_edge_crit_edge ], [ %1852, %510 ]
+ %2205 = phi <2 x half> [ %269, %.._crit_edge_crit_edge ], [ %1853, %510 ]
+ %2206 = phi <2 x half> [ %270, %.._crit_edge_crit_edge ], [ %1854, %510 ]
+ %2207 = phi <2 x half> [ %271, %.._crit_edge_crit_edge ], [ %1855, %510 ]
+ %2208 = phi <2 x half> [ %272, %.._crit_edge_crit_edge ], [ %1856, %510 ]
+ %2209 = phi <2 x half> [ %273, %.._crit_edge_crit_edge ], [ %1857, %510 ]
+ %2210 = phi <2 x half> [ %274, %.._crit_edge_crit_edge ], [ %1858, %510 ]
+ %2211 = phi <2 x half> [ %275, %.._crit_edge_crit_edge ], [ %1859, %510 ]
+ %2212 = phi <2 x half> [ %276, %.._crit_edge_crit_edge ], [ %1860, %510 ]
+ %2213 = phi <2 x half> [ %277, %.._crit_edge_crit_edge ], [ %1861, %510 ]
+ %2214 = phi <2 x half> [ %278, %.._crit_edge_crit_edge ], [ %1862, %510 ]
+ %2215 = phi <2 x half> [ %279, %.._crit_edge_crit_edge ], [ %1863, %510 ]
+ %2216 = phi <2 x half> [ %280, %.._crit_edge_crit_edge ], [ %1864, %510 ]
+ %2217 = phi <2 x half> [ %281, %.._crit_edge_crit_edge ], [ %1865, %510 ]
+ %2218 = phi <2 x half> [ %282, %.._crit_edge_crit_edge ], [ %1866, %510 ]
+ %2219 = phi <2 x half> [ %283, %.._crit_edge_crit_edge ], [ %1867, %510 ]
+ %2220 = phi <2 x half> [ %284, %.._crit_edge_crit_edge ], [ %1868, %510 ]
+ %2221 = phi <2 x half> [ %285, %.._crit_edge_crit_edge ], [ %1869, %510 ]
+ %2222 = phi <2 x half> [ %286, %.._crit_edge_crit_edge ], [ %1870, %510 ]
+ %2223 = phi <2 x half> [ %287, %.._crit_edge_crit_edge ], [ %1871, %510 ]
+ %2224 = phi <2 x half> [ %288, %.._crit_edge_crit_edge ], [ %1872, %510 ]
+ %2225 = phi <2 x half> [ %289, %.._crit_edge_crit_edge ], [ %1873, %510 ]
+ %2226 = phi <2 x half> [ %290, %.._crit_edge_crit_edge ], [ %1874, %510 ]
+ %2227 = phi <2 x half> [ %291, %.._crit_edge_crit_edge ], [ %1875, %510 ]
+ %2228 = phi <2 x half> [ %292, %.._crit_edge_crit_edge ], [ %1876, %510 ]
+ %2229 = phi <2 x half> [ %293, %.._crit_edge_crit_edge ], [ %1877, %510 ]
+ %2230 = phi <2 x half> [ %294, %.._crit_edge_crit_edge ], [ %1878, %510 ]
+ %2231 = phi <2 x half> [ %295, %.._crit_edge_crit_edge ], [ %1879, %510 ]
+ %2232 = phi <2 x half> [ %296, %.._crit_edge_crit_edge ], [ %1880, %510 ]
+ %2233 = phi <2 x half> [ %297, %.._crit_edge_crit_edge ], [ %1881, %510 ]
+ %2234 = phi <2 x half> [ %298, %.._crit_edge_crit_edge ], [ %1882, %510 ]
+ %2235 = phi <2 x half> [ %299, %.._crit_edge_crit_edge ], [ %1883, %510 ]
+ %2236 = phi <2 x half> [ %300, %.._crit_edge_crit_edge ], [ %1884, %510 ]
+ %2237 = phi <2 x half> [ %301, %.._crit_edge_crit_edge ], [ %1885, %510 ]
+ %2238 = phi <2 x half> [ %302, %.._crit_edge_crit_edge ], [ %1886, %510 ]
+ %2239 = phi <2 x half> [ %303, %.._crit_edge_crit_edge ], [ %1887, %510 ]
+ %2240 = phi <2 x half> [ %304, %.._crit_edge_crit_edge ], [ %1888, %510 ]
+ %2241 = phi <2 x half> [ %305, %.._crit_edge_crit_edge ], [ %1889, %510 ]
+ %2242 = phi <2 x half> [ %306, %.._crit_edge_crit_edge ], [ %1890, %510 ]
+ %2243 = phi <2 x half> [ %307, %.._crit_edge_crit_edge ], [ %1891, %510 ]
+ %2244 = phi <2 x half> [ %308, %.._crit_edge_crit_edge ], [ %1892, %510 ]
+ %2245 = phi <2 x half> [ %309, %.._crit_edge_crit_edge ], [ %1893, %510 ]
+ %2246 = phi <2 x half> [ %310, %.._crit_edge_crit_edge ], [ %1894, %510 ]
+ %2247 = phi <2 x half> [ %311, %.._crit_edge_crit_edge ], [ %1895, %510 ]
+ %2248 = phi <2 x half> [ %312, %.._crit_edge_crit_edge ], [ %1896, %510 ]
+ %2249 = phi <2 x half> [ %313, %.._crit_edge_crit_edge ], [ %1897, %510 ]
+ %2250 = phi <2 x half> [ %314, %.._crit_edge_crit_edge ], [ %1898, %510 ]
+ %2251 = phi <2 x half> [ %315, %.._crit_edge_crit_edge ], [ %1899, %510 ]
+ %2252 = phi <2 x half> [ %316, %.._crit_edge_crit_edge ], [ %1900, %510 ]
+ %2253 = phi <2 x half> [ %317, %.._crit_edge_crit_edge ], [ %1901, %510 ]
+ %2254 = phi <2 x half> [ %318, %.._crit_edge_crit_edge ], [ %1902, %510 ]
+ %2255 = phi <2 x half> [ %319, %.._crit_edge_crit_edge ], [ %1903, %510 ]
+ %2256 = phi <2 x half> [ %320, %.._crit_edge_crit_edge ], [ %1904, %510 ]
+ %2257 = phi <2 x half> [ %321, %.._crit_edge_crit_edge ], [ %1905, %510 ]
+ %2258 = phi <2 x half> [ %322, %.._crit_edge_crit_edge ], [ %1906, %510 ]
+ %2259 = phi <2 x half> [ %323, %.._crit_edge_crit_edge ], [ %1907, %510 ]
+ %2260 = phi <2 x half> [ %324, %.._crit_edge_crit_edge ], [ %1908, %510 ]
+ %2261 = phi <2 x half> [ %325, %.._crit_edge_crit_edge ], [ %1909, %510 ]
+ %2262 = phi <2 x half> [ %326, %.._crit_edge_crit_edge ], [ %1910, %510 ]
+ %2263 = phi <2 x half> [ %327, %.._crit_edge_crit_edge ], [ %1911, %510 ]
+ %2264 = phi <2 x half> [ %328, %.._crit_edge_crit_edge ], [ %1912, %510 ]
+ %2265 = phi <2 x half> [ %329, %.._crit_edge_crit_edge ], [ %1913, %510 ]
+ %2266 = phi <2 x half> [ %330, %.._crit_edge_crit_edge ], [ %1914, %510 ]
+ %2267 = phi <2 x half> [ %331, %.._crit_edge_crit_edge ], [ %1915, %510 ]
+ %2268 = phi <2 x half> [ %332, %.._crit_edge_crit_edge ], [ %1916, %510 ]
+ %2269 = phi <2 x half> [ %333, %.._crit_edge_crit_edge ], [ %1917, %510 ]
+ %2270 = phi <2 x half> [ %334, %.._crit_edge_crit_edge ], [ %1918, %510 ]
+ %2271 = phi <2 x half> [ %335, %.._crit_edge_crit_edge ], [ %1919, %510 ]
+ %2272 = phi <2 x half> [ %336, %.._crit_edge_crit_edge ], [ %1920, %510 ]
+ %2273 = phi <2 x half> [ %337, %.._crit_edge_crit_edge ], [ %1921, %510 ]
+ %2274 = phi <2 x half> [ %338, %.._crit_edge_crit_edge ], [ %1922, %510 ]
+ %2275 = phi <2 x half> [ %339, %.._crit_edge_crit_edge ], [ %1923, %510 ]
+ %2276 = phi <2 x half> [ %340, %.._crit_edge_crit_edge ], [ %1924, %510 ]
+ %2277 = phi <2 x half> [ %341, %.._crit_edge_crit_edge ], [ %1925, %510 ]
+ %2278 = phi <2 x half> [ %342, %.._crit_edge_crit_edge ], [ %1926, %510 ]
+ %2279 = phi <2 x half> [ %343, %.._crit_edge_crit_edge ], [ %1927, %510 ]
+ %2280 = phi <2 x half> [ %344, %.._crit_edge_crit_edge ], [ %1928, %510 ]
+ %2281 = phi <2 x half> [ %345, %.._crit_edge_crit_edge ], [ %1929, %510 ]
+ %2282 = phi <2 x half> [ %346, %.._crit_edge_crit_edge ], [ %1930, %510 ]
+ %2283 = phi <2 x half> [ %347, %.._crit_edge_crit_edge ], [ %1931, %510 ]
+ %2284 = phi <2 x half> [ %348, %.._crit_edge_crit_edge ], [ %1932, %510 ]
+ %2285 = phi <2 x half> [ %349, %.._crit_edge_crit_edge ], [ %1933, %510 ]
+ %2286 = phi <2 x half> [ %350, %.._crit_edge_crit_edge ], [ %1934, %510 ]
+ %2287 = phi <2 x half> [ %351, %.._crit_edge_crit_edge ], [ %1935, %510 ]
+ %2288 = phi <2 x half> [ %352, %.._crit_edge_crit_edge ], [ %1936, %510 ]
+ %2289 = phi <2 x half> [ %353, %.._crit_edge_crit_edge ], [ %1937, %510 ]
+ %2290 = phi <2 x half> [ %354, %.._crit_edge_crit_edge ], [ %1938, %510 ]
+ %2291 = phi <2 x half> [ %355, %.._crit_edge_crit_edge ], [ %1939, %510 ]
+ %2292 = phi <2 x half> [ %356, %.._crit_edge_crit_edge ], [ %1940, %510 ]
+ %2293 = phi <2 x half> [ %357, %.._crit_edge_crit_edge ], [ %1941, %510 ]
+ %2294 = phi <2 x half> [ %358, %.._crit_edge_crit_edge ], [ %1942, %510 ]
+ %2295 = and i32 %237, 28
+ %2296 = or disjoint i32 %2295, 224
+ %2297 = or disjoint i32 %2295, 192
+ %2298 = or disjoint i32 %2295, 160
+ %2299 = or disjoint i32 %2295, 128
+ %2300 = or disjoint i32 %2295, 96
+ %2301 = or disjoint i32 %2295, 64
+ %2302 = or disjoint i32 %2295, 32
+ %2303 = or disjoint i32 %225, 224
+ %2304 = or disjoint i32 %225, 192
+ %2305 = or disjoint i32 %225, 160
+ %2306 = or disjoint i32 %225, 128
+ %2307 = or disjoint i32 %225, 96
+ %2308 = or disjoint i32 %225, 64
+ %2309 = or disjoint i32 %225, 32
+ %2310 = shufflevector <2 x half> %2231, <2 x half> %2232, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2311 = shufflevector <2 x half> %2233, <2 x half> %2234, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2312 = shufflevector <2 x half> %2235, <2 x half> %2236, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2313 = shufflevector <2 x half> %2237, <2 x half> %2238, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2314 = shufflevector <2 x half> %2239, <2 x half> %2240, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2315 = shufflevector <2 x half> %2241, <2 x half> %2242, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2316 = shufflevector <2 x half> %2243, <2 x half> %2244, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2317 = shufflevector <2 x half> %2245, <2 x half> %2246, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2318 = insertelement <4 x float> poison, float %1943, i64 0
+ %2319 = insertelement <4 x float> %2318, float %1944, i64 1
+ %2320 = insertelement <4 x float> %2319, float %1945, i64 2
+ %2321 = insertelement <4 x float> %2320, float %1946, i64 3
+ %2322 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2310, <4 x float> %2321, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2323 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2311, <4 x float> %2322, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2324 = insertelement <4 x float> poison, float %1947, i64 0
+ %2325 = insertelement <4 x float> %2324, float %1948, i64 1
+ %2326 = insertelement <4 x float> %2325, float %1949, i64 2
+ %2327 = insertelement <4 x float> %2326, float %1950, i64 3
+ %2328 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2310, <4 x float> %2327, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2329 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2311, <4 x float> %2328, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2330 = insertelement <4 x float> poison, float %1951, i64 0
+ %2331 = insertelement <4 x float> %2330, float %1952, i64 1
+ %2332 = insertelement <4 x float> %2331, float %1953, i64 2
+ %2333 = insertelement <4 x float> %2332, float %1954, i64 3
+ %2334 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2312, <4 x float> %2333, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2335 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2313, <4 x float> %2334, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2336 = insertelement <4 x float> poison, float %1955, i64 0
+ %2337 = insertelement <4 x float> %2336, float %1956, i64 1
+ %2338 = insertelement <4 x float> %2337, float %1957, i64 2
+ %2339 = insertelement <4 x float> %2338, float %1958, i64 3
+ %2340 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2312, <4 x float> %2339, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2341 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2313, <4 x float> %2340, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2342 = shufflevector <2 x half> %2255, <2 x half> %2256, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2343 = shufflevector <2 x half> %2257, <2 x half> %2258, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2344 = shufflevector <2 x half> %2259, <2 x half> %2260, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2345 = shufflevector <2 x half> %2261, <2 x half> %2262, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2346 = insertelement <4 x float> poison, float %1959, i64 0
+ %2347 = insertelement <4 x float> %2346, float %1960, i64 1
+ %2348 = insertelement <4 x float> %2347, float %1961, i64 2
+ %2349 = insertelement <4 x float> %2348, float %1962, i64 3
+ %2350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2310, <4 x float> %2349, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2311, <4 x float> %2350, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2352 = insertelement <4 x float> poison, float %1963, i64 0
+ %2353 = insertelement <4 x float> %2352, float %1964, i64 1
+ %2354 = insertelement <4 x float> %2353, float %1965, i64 2
+ %2355 = insertelement <4 x float> %2354, float %1966, i64 3
+ %2356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2310, <4 x float> %2355, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2311, <4 x float> %2356, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2358 = insertelement <4 x float> poison, float %1967, i64 0
+ %2359 = insertelement <4 x float> %2358, float %1968, i64 1
+ %2360 = insertelement <4 x float> %2359, float %1969, i64 2
+ %2361 = insertelement <4 x float> %2360, float %1970, i64 3
+ %2362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2312, <4 x float> %2361, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2313, <4 x float> %2362, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2364 = insertelement <4 x float> poison, float %1971, i64 0
+ %2365 = insertelement <4 x float> %2364, float %1972, i64 1
+ %2366 = insertelement <4 x float> %2365, float %1973, i64 2
+ %2367 = insertelement <4 x float> %2366, float %1974, i64 3
+ %2368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2312, <4 x float> %2367, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2313, <4 x float> %2368, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2370 = shufflevector <2 x half> %2247, <2 x half> %2248, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2371 = shufflevector <2 x half> %2249, <2 x half> %2250, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2372 = shufflevector <2 x half> %2251, <2 x half> %2252, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2373 = shufflevector <2 x half> %2253, <2 x half> %2254, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2374 = insertelement <4 x float> poison, float %2007, i64 0
+ %2375 = insertelement <4 x float> %2374, float %2008, i64 1
+ %2376 = insertelement <4 x float> %2375, float %2009, i64 2
+ %2377 = insertelement <4 x float> %2376, float %2010, i64 3
+ %2378 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2370, <4 x float> %2377, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2379 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2371, <4 x float> %2378, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2380 = insertelement <4 x float> poison, float %2011, i64 0
+ %2381 = insertelement <4 x float> %2380, float %2012, i64 1
+ %2382 = insertelement <4 x float> %2381, float %2013, i64 2
+ %2383 = insertelement <4 x float> %2382, float %2014, i64 3
+ %2384 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2370, <4 x float> %2383, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2385 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2371, <4 x float> %2384, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2386 = insertelement <4 x float> poison, float %2015, i64 0
+ %2387 = insertelement <4 x float> %2386, float %2016, i64 1
+ %2388 = insertelement <4 x float> %2387, float %2017, i64 2
+ %2389 = insertelement <4 x float> %2388, float %2018, i64 3
+ %2390 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2372, <4 x float> %2389, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2391 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2373, <4 x float> %2390, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2392 = insertelement <4 x float> poison, float %2019, i64 0
+ %2393 = insertelement <4 x float> %2392, float %2020, i64 1
+ %2394 = insertelement <4 x float> %2393, float %2021, i64 2
+ %2395 = insertelement <4 x float> %2394, float %2022, i64 3
+ %2396 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2372, <4 x float> %2395, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2397 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2373, <4 x float> %2396, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2398 = insertelement <4 x float> poison, float %2023, i64 0
+ %2399 = insertelement <4 x float> %2398, float %2024, i64 1
+ %2400 = insertelement <4 x float> %2399, float %2025, i64 2
+ %2401 = insertelement <4 x float> %2400, float %2026, i64 3
+ %2402 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2370, <4 x float> %2401, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2403 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2371, <4 x float> %2402, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2404 = insertelement <4 x float> poison, float %2027, i64 0
+ %2405 = insertelement <4 x float> %2404, float %2028, i64 1
+ %2406 = insertelement <4 x float> %2405, float %2029, i64 2
+ %2407 = insertelement <4 x float> %2406, float %2030, i64 3
+ %2408 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2370, <4 x float> %2407, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2409 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2371, <4 x float> %2408, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2410 = insertelement <4 x float> poison, float %2031, i64 0
+ %2411 = insertelement <4 x float> %2410, float %2032, i64 1
+ %2412 = insertelement <4 x float> %2411, float %2033, i64 2
+ %2413 = insertelement <4 x float> %2412, float %2034, i64 3
+ %2414 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2372, <4 x float> %2413, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2415 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2373, <4 x float> %2414, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2416 = insertelement <4 x float> poison, float %2035, i64 0
+ %2417 = insertelement <4 x float> %2416, float %2036, i64 1
+ %2418 = insertelement <4 x float> %2417, float %2037, i64 2
+ %2419 = insertelement <4 x float> %2418, float %2038, i64 3
+ %2420 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2372, <4 x float> %2419, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2421 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2373, <4 x float> %2420, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0), !dbg !111
+ %2422 = or disjoint i32 %.pre-phi1014, 2048
+ %2423 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1014
+ %2424 = load <8 x half>, ptr addrspace(3) %2423, align 16
+ %2425 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2422
+ %2426 = load <8 x half>, ptr addrspace(3) %2425, align 16
+ %2427 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2428 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2429 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2430 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2431 = insertelement <4 x float> poison, float %1975, i64 0
+ %2432 = insertelement <4 x float> %2431, float %1976, i64 1
+ %2433 = insertelement <4 x float> %2432, float %1977, i64 2
+ %2434 = insertelement <4 x float> %2433, float %1978, i64 3
+ %2435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2310, <4 x float> %2434, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2436 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2311, <4 x float> %2435, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2437 = insertelement <4 x float> poison, float %1979, i64 0
+ %2438 = insertelement <4 x float> %2437, float %1980, i64 1
+ %2439 = insertelement <4 x float> %2438, float %1981, i64 2
+ %2440 = insertelement <4 x float> %2439, float %1982, i64 3
+ %2441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2310, <4 x float> %2440, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2442 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2311, <4 x float> %2441, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2443 = insertelement <4 x float> poison, float %1983, i64 0
+ %2444 = insertelement <4 x float> %2443, float %1984, i64 1
+ %2445 = insertelement <4 x float> %2444, float %1985, i64 2
+ %2446 = insertelement <4 x float> %2445, float %1986, i64 3
+ %2447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2312, <4 x float> %2446, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2448 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2313, <4 x float> %2447, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2449 = insertelement <4 x float> poison, float %1987, i64 0
+ %2450 = insertelement <4 x float> %2449, float %1988, i64 1
+ %2451 = insertelement <4 x float> %2450, float %1989, i64 2
+ %2452 = insertelement <4 x float> %2451, float %1990, i64 3
+ %2453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2312, <4 x float> %2452, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2454 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2313, <4 x float> %2453, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2455 = or disjoint i32 %.pre-phi1018, 2048
+ %2456 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1018
+ %2457 = load <8 x half>, ptr addrspace(3) %2456, align 16
+ %2458 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2455
+ %2459 = load <8 x half>, ptr addrspace(3) %2458, align 16
+ %2460 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2461 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2462 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2463 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2464 = insertelement <4 x float> poison, float %1991, i64 0
+ %2465 = insertelement <4 x float> %2464, float %1992, i64 1
+ %2466 = insertelement <4 x float> %2465, float %1993, i64 2
+ %2467 = insertelement <4 x float> %2466, float %1994, i64 3
+ %2468 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2310, <4 x float> %2467, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2469 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2311, <4 x float> %2468, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2470 = insertelement <4 x float> poison, float %1995, i64 0
+ %2471 = insertelement <4 x float> %2470, float %1996, i64 1
+ %2472 = insertelement <4 x float> %2471, float %1997, i64 2
+ %2473 = insertelement <4 x float> %2472, float %1998, i64 3
+ %2474 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2310, <4 x float> %2473, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2475 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2311, <4 x float> %2474, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2476 = insertelement <4 x float> poison, float %1999, i64 0
+ %2477 = insertelement <4 x float> %2476, float %2000, i64 1
+ %2478 = insertelement <4 x float> %2477, float %2001, i64 2
+ %2479 = insertelement <4 x float> %2478, float %2002, i64 3
+ %2480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2312, <4 x float> %2479, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2313, <4 x float> %2480, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2482 = insertelement <4 x float> poison, float %2003, i64 0
+ %2483 = insertelement <4 x float> %2482, float %2004, i64 1
+ %2484 = insertelement <4 x float> %2483, float %2005, i64 2
+ %2485 = insertelement <4 x float> %2484, float %2006, i64 3
+ %2486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2312, <4 x float> %2485, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2313, <4 x float> %2486, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2488 = insertelement <4 x float> poison, float %2039, i64 0
+ %2489 = insertelement <4 x float> %2488, float %2040, i64 1
+ %2490 = insertelement <4 x float> %2489, float %2041, i64 2
+ %2491 = insertelement <4 x float> %2490, float %2042, i64 3
+ %2492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2370, <4 x float> %2491, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2371, <4 x float> %2492, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2494 = insertelement <4 x float> poison, float %2043, i64 0
+ %2495 = insertelement <4 x float> %2494, float %2044, i64 1
+ %2496 = insertelement <4 x float> %2495, float %2045, i64 2
+ %2497 = insertelement <4 x float> %2496, float %2046, i64 3
+ %2498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2370, <4 x float> %2497, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2371, <4 x float> %2498, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2500 = insertelement <4 x float> poison, float %2047, i64 0
+ %2501 = insertelement <4 x float> %2500, float %2048, i64 1
+ %2502 = insertelement <4 x float> %2501, float %2049, i64 2
+ %2503 = insertelement <4 x float> %2502, float %2050, i64 3
+ %2504 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2372, <4 x float> %2503, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2505 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2373, <4 x float> %2504, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2506 = insertelement <4 x float> poison, float %2051, i64 0
+ %2507 = insertelement <4 x float> %2506, float %2052, i64 1
+ %2508 = insertelement <4 x float> %2507, float %2053, i64 2
+ %2509 = insertelement <4 x float> %2508, float %2054, i64 3
+ %2510 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2372, <4 x float> %2509, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2511 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2373, <4 x float> %2510, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2512 = insertelement <4 x float> poison, float %2055, i64 0
+ %2513 = insertelement <4 x float> %2512, float %2056, i64 1
+ %2514 = insertelement <4 x float> %2513, float %2057, i64 2
+ %2515 = insertelement <4 x float> %2514, float %2058, i64 3
+ %2516 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2370, <4 x float> %2515, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2517 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2371, <4 x float> %2516, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2518 = insertelement <4 x float> poison, float %2059, i64 0
+ %2519 = insertelement <4 x float> %2518, float %2060, i64 1
+ %2520 = insertelement <4 x float> %2519, float %2061, i64 2
+ %2521 = insertelement <4 x float> %2520, float %2062, i64 3
+ %2522 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2370, <4 x float> %2521, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2523 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2371, <4 x float> %2522, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2524 = insertelement <4 x float> poison, float %2063, i64 0
+ %2525 = insertelement <4 x float> %2524, float %2064, i64 1
+ %2526 = insertelement <4 x float> %2525, float %2065, i64 2
+ %2527 = insertelement <4 x float> %2526, float %2066, i64 3
+ %2528 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2372, <4 x float> %2527, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2529 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2373, <4 x float> %2528, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2530 = insertelement <4 x float> poison, float %2067, i64 0
+ %2531 = insertelement <4 x float> %2530, float %2068, i64 1
+ %2532 = insertelement <4 x float> %2531, float %2069, i64 2
+ %2533 = insertelement <4 x float> %2532, float %2070, i64 3
+ %2534 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2372, <4 x float> %2533, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2535 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2373, <4 x float> %2534, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2536 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1022
+ %2537 = load <8 x half>, ptr addrspace(3) %2536, align 16
+ %2538 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1026
+ %2539 = load <8 x half>, ptr addrspace(3) %2538, align 16
+ %2540 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2541 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2542 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2543 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2544 = insertelement <4 x float> poison, float %2071, i64 0
+ %2545 = insertelement <4 x float> %2544, float %2072, i64 1
+ %2546 = insertelement <4 x float> %2545, float %2073, i64 2
+ %2547 = insertelement <4 x float> %2546, float %2074, i64 3
+ %2548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2540, <4 x float> %2547, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2541, <4 x float> %2548, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2550 = insertelement <4 x float> poison, float %2075, i64 0
+ %2551 = insertelement <4 x float> %2550, float %2076, i64 1
+ %2552 = insertelement <4 x float> %2551, float %2077, i64 2
+ %2553 = insertelement <4 x float> %2552, float %2078, i64 3
+ %2554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2540, <4 x float> %2553, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2541, <4 x float> %2554, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2556 = insertelement <4 x float> poison, float %2079, i64 0
+ %2557 = insertelement <4 x float> %2556, float %2080, i64 1
+ %2558 = insertelement <4 x float> %2557, float %2081, i64 2
+ %2559 = insertelement <4 x float> %2558, float %2082, i64 3
+ %2560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2542, <4 x float> %2559, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2543, <4 x float> %2560, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2562 = insertelement <4 x float> poison, float %2083, i64 0
+ %2563 = insertelement <4 x float> %2562, float %2084, i64 1
+ %2564 = insertelement <4 x float> %2563, float %2085, i64 2
+ %2565 = insertelement <4 x float> %2564, float %2086, i64 3
+ %2566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2542, <4 x float> %2565, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2543, <4 x float> %2566, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2568 = insertelement <4 x float> poison, float %2087, i64 0
+ %2569 = insertelement <4 x float> %2568, float %2088, i64 1
+ %2570 = insertelement <4 x float> %2569, float %2089, i64 2
+ %2571 = insertelement <4 x float> %2570, float %2090, i64 3
+ %2572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2540, <4 x float> %2571, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2541, <4 x float> %2572, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2574 = insertelement <4 x float> poison, float %2091, i64 0
+ %2575 = insertelement <4 x float> %2574, float %2092, i64 1
+ %2576 = insertelement <4 x float> %2575, float %2093, i64 2
+ %2577 = insertelement <4 x float> %2576, float %2094, i64 3
+ %2578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2540, <4 x float> %2577, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2541, <4 x float> %2578, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2580 = insertelement <4 x float> poison, float %2095, i64 0
+ %2581 = insertelement <4 x float> %2580, float %2096, i64 1
+ %2582 = insertelement <4 x float> %2581, float %2097, i64 2
+ %2583 = insertelement <4 x float> %2582, float %2098, i64 3
+ %2584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2542, <4 x float> %2583, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2543, <4 x float> %2584, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2586 = insertelement <4 x float> poison, float %2099, i64 0
+ %2587 = insertelement <4 x float> %2586, float %2100, i64 1
+ %2588 = insertelement <4 x float> %2587, float %2101, i64 2
+ %2589 = insertelement <4 x float> %2588, float %2102, i64 3
+ %2590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2542, <4 x float> %2589, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2543, <4 x float> %2590, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2592 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1030
+ %2593 = load <8 x half>, ptr addrspace(3) %2592, align 16
+ %2594 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1034
+ %2595 = load <8 x half>, ptr addrspace(3) %2594, align 16
+ %2596 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2597 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2598 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2599 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2600 = insertelement <4 x float> poison, float %2135, i64 0
+ %2601 = insertelement <4 x float> %2600, float %2136, i64 1
+ %2602 = insertelement <4 x float> %2601, float %2137, i64 2
+ %2603 = insertelement <4 x float> %2602, float %2138, i64 3
+ %2604 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2596, <4 x float> %2603, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2605 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2597, <4 x float> %2604, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2606 = insertelement <4 x float> poison, float %2139, i64 0
+ %2607 = insertelement <4 x float> %2606, float %2140, i64 1
+ %2608 = insertelement <4 x float> %2607, float %2141, i64 2
+ %2609 = insertelement <4 x float> %2608, float %2142, i64 3
+ %2610 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2596, <4 x float> %2609, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2611 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2597, <4 x float> %2610, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2612 = insertelement <4 x float> poison, float %2143, i64 0
+ %2613 = insertelement <4 x float> %2612, float %2144, i64 1
+ %2614 = insertelement <4 x float> %2613, float %2145, i64 2
+ %2615 = insertelement <4 x float> %2614, float %2146, i64 3
+ %2616 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2598, <4 x float> %2615, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2617 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2599, <4 x float> %2616, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2618 = insertelement <4 x float> poison, float %2147, i64 0
+ %2619 = insertelement <4 x float> %2618, float %2148, i64 1
+ %2620 = insertelement <4 x float> %2619, float %2149, i64 2
+ %2621 = insertelement <4 x float> %2620, float %2150, i64 3
+ %2622 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2598, <4 x float> %2621, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2623 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2599, <4 x float> %2622, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2624 = insertelement <4 x float> poison, float %2151, i64 0
+ %2625 = insertelement <4 x float> %2624, float %2152, i64 1
+ %2626 = insertelement <4 x float> %2625, float %2153, i64 2
+ %2627 = insertelement <4 x float> %2626, float %2154, i64 3
+ %2628 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2596, <4 x float> %2627, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2629 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2597, <4 x float> %2628, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2630 = insertelement <4 x float> poison, float %2155, i64 0
+ %2631 = insertelement <4 x float> %2630, float %2156, i64 1
+ %2632 = insertelement <4 x float> %2631, float %2157, i64 2
+ %2633 = insertelement <4 x float> %2632, float %2158, i64 3
+ %2634 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2596, <4 x float> %2633, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2635 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2597, <4 x float> %2634, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2636 = insertelement <4 x float> poison, float %2159, i64 0
+ %2637 = insertelement <4 x float> %2636, float %2160, i64 1
+ %2638 = insertelement <4 x float> %2637, float %2161, i64 2
+ %2639 = insertelement <4 x float> %2638, float %2162, i64 3
+ %2640 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2598, <4 x float> %2639, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2641 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2599, <4 x float> %2640, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2642 = insertelement <4 x float> poison, float %2163, i64 0
+ %2643 = insertelement <4 x float> %2642, float %2164, i64 1
+ %2644 = insertelement <4 x float> %2643, float %2165, i64 2
+ %2645 = insertelement <4 x float> %2644, float %2166, i64 3
+ %2646 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2598, <4 x float> %2645, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2647 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2599, <4 x float> %2646, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2648 = insertelement <4 x float> poison, float %2103, i64 0
+ %2649 = insertelement <4 x float> %2648, float %2104, i64 1
+ %2650 = insertelement <4 x float> %2649, float %2105, i64 2
+ %2651 = insertelement <4 x float> %2650, float %2106, i64 3
+ %2652 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2540, <4 x float> %2651, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2653 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2541, <4 x float> %2652, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2654 = insertelement <4 x float> poison, float %2107, i64 0
+ %2655 = insertelement <4 x float> %2654, float %2108, i64 1
+ %2656 = insertelement <4 x float> %2655, float %2109, i64 2
+ %2657 = insertelement <4 x float> %2656, float %2110, i64 3
+ %2658 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2540, <4 x float> %2657, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2659 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2541, <4 x float> %2658, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2660 = insertelement <4 x float> poison, float %2111, i64 0
+ %2661 = insertelement <4 x float> %2660, float %2112, i64 1
+ %2662 = insertelement <4 x float> %2661, float %2113, i64 2
+ %2663 = insertelement <4 x float> %2662, float %2114, i64 3
+ %2664 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2542, <4 x float> %2663, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2665 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2543, <4 x float> %2664, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2666 = insertelement <4 x float> poison, float %2115, i64 0
+ %2667 = insertelement <4 x float> %2666, float %2116, i64 1
+ %2668 = insertelement <4 x float> %2667, float %2117, i64 2
+ %2669 = insertelement <4 x float> %2668, float %2118, i64 3
+ %2670 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2542, <4 x float> %2669, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2671 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2543, <4 x float> %2670, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2672 = insertelement <4 x float> poison, float %2119, i64 0
+ %2673 = insertelement <4 x float> %2672, float %2120, i64 1
+ %2674 = insertelement <4 x float> %2673, float %2121, i64 2
+ %2675 = insertelement <4 x float> %2674, float %2122, i64 3
+ %2676 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2540, <4 x float> %2675, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2677 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2541, <4 x float> %2676, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2678 = insertelement <4 x float> poison, float %2123, i64 0
+ %2679 = insertelement <4 x float> %2678, float %2124, i64 1
+ %2680 = insertelement <4 x float> %2679, float %2125, i64 2
+ %2681 = insertelement <4 x float> %2680, float %2126, i64 3
+ %2682 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2540, <4 x float> %2681, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2683 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2541, <4 x float> %2682, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2684 = insertelement <4 x float> poison, float %2127, i64 0
+ %2685 = insertelement <4 x float> %2684, float %2128, i64 1
+ %2686 = insertelement <4 x float> %2685, float %2129, i64 2
+ %2687 = insertelement <4 x float> %2686, float %2130, i64 3
+ %2688 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2542, <4 x float> %2687, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2689 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2543, <4 x float> %2688, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2690 = insertelement <4 x float> poison, float %2131, i64 0
+ %2691 = insertelement <4 x float> %2690, float %2132, i64 1
+ %2692 = insertelement <4 x float> %2691, float %2133, i64 2
+ %2693 = insertelement <4 x float> %2692, float %2134, i64 3
+ %2694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2542, <4 x float> %2693, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2543, <4 x float> %2694, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2696 = insertelement <4 x float> poison, float %2167, i64 0
+ %2697 = insertelement <4 x float> %2696, float %2168, i64 1
+ %2698 = insertelement <4 x float> %2697, float %2169, i64 2
+ %2699 = insertelement <4 x float> %2698, float %2170, i64 3
+ %2700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2596, <4 x float> %2699, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2597, <4 x float> %2700, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2702 = insertelement <4 x float> poison, float %2171, i64 0
+ %2703 = insertelement <4 x float> %2702, float %2172, i64 1
+ %2704 = insertelement <4 x float> %2703, float %2173, i64 2
+ %2705 = insertelement <4 x float> %2704, float %2174, i64 3
+ %2706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2596, <4 x float> %2705, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2597, <4 x float> %2706, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2708 = insertelement <4 x float> poison, float %2175, i64 0
+ %2709 = insertelement <4 x float> %2708, float %2176, i64 1
+ %2710 = insertelement <4 x float> %2709, float %2177, i64 2
+ %2711 = insertelement <4 x float> %2710, float %2178, i64 3
+ %2712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2598, <4 x float> %2711, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2599, <4 x float> %2712, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2714 = insertelement <4 x float> poison, float %2179, i64 0
+ %2715 = insertelement <4 x float> %2714, float %2180, i64 1
+ %2716 = insertelement <4 x float> %2715, float %2181, i64 2
+ %2717 = insertelement <4 x float> %2716, float %2182, i64 3
+ %2718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2598, <4 x float> %2717, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2599, <4 x float> %2718, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2720 = insertelement <4 x float> poison, float %2183, i64 0
+ %2721 = insertelement <4 x float> %2720, float %2184, i64 1
+ %2722 = insertelement <4 x float> %2721, float %2185, i64 2
+ %2723 = insertelement <4 x float> %2722, float %2186, i64 3
+ %2724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2596, <4 x float> %2723, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2597, <4 x float> %2724, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2726 = insertelement <4 x float> poison, float %2187, i64 0
+ %2727 = insertelement <4 x float> %2726, float %2188, i64 1
+ %2728 = insertelement <4 x float> %2727, float %2189, i64 2
+ %2729 = insertelement <4 x float> %2728, float %2190, i64 3
+ %2730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2596, <4 x float> %2729, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2597, <4 x float> %2730, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2732 = insertelement <4 x float> poison, float %2191, i64 0
+ %2733 = insertelement <4 x float> %2732, float %2192, i64 1
+ %2734 = insertelement <4 x float> %2733, float %2193, i64 2
+ %2735 = insertelement <4 x float> %2734, float %2194, i64 3
+ %2736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2598, <4 x float> %2735, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2599, <4 x float> %2736, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2738 = insertelement <4 x float> poison, float %2195, i64 0
+ %2739 = insertelement <4 x float> %2738, float %2196, i64 1
+ %2740 = insertelement <4 x float> %2739, float %2197, i64 2
+ %2741 = insertelement <4 x float> %2740, float %2198, i64 3
+ %2742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2598, <4 x float> %2741, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2599, <4 x float> %2742, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2744 = or disjoint i32 %.pre-phi1042, 2048
+ %2745 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1042
+ %2746 = load <8 x half>, ptr addrspace(3) %2745, align 16
+ %2747 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2744
+ %2748 = load <8 x half>, ptr addrspace(3) %2747, align 16
+ %2749 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1044
+ %2750 = load <8 x half>, ptr addrspace(3) %2749, align 16
+ %2751 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1046
+ %2752 = load <8 x half>, ptr addrspace(3) %2751, align 16
+ %2753 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2754 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2755 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2756 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2757 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2758 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2759 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2760 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2753, <4 x float> %2323, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2762 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2754, <4 x float> %2761, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2763 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2753, <4 x float> %2329, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2764 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2754, <4 x float> %2763, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2765 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2755, <4 x float> %2335, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2766 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2756, <4 x float> %2765, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2767 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2755, <4 x float> %2341, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2768 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2756, <4 x float> %2767, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2769 = or disjoint i32 %.pre-phi1048, 2048
+ %2770 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1048
+ %2771 = load <8 x half>, ptr addrspace(3) %2770, align 16
+ %2772 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2769
+ %2773 = load <8 x half>, ptr addrspace(3) %2772, align 16
+ %2774 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2775 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2776 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2777 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2778 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2753, <4 x float> %2351, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2779 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2754, <4 x float> %2778, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2780 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2753, <4 x float> %2357, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2781 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2754, <4 x float> %2780, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2755, <4 x float> %2363, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2756, <4 x float> %2782, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2784 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2755, <4 x float> %2369, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2785 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2756, <4 x float> %2784, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2786 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1050
+ %2787 = load <8 x half>, ptr addrspace(3) %2786, align 16
+ %2788 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1052
+ %2789 = load <8 x half>, ptr addrspace(3) %2788, align 16
+ %2790 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2791 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2792 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2793 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2794 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2790, <4 x float> %2379, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2795 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2791, <4 x float> %2794, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2796 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2790, <4 x float> %2385, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2797 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2791, <4 x float> %2796, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2792, <4 x float> %2391, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2793, <4 x float> %2798, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2800 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2792, <4 x float> %2397, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2801 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2793, <4 x float> %2800, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2802 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2790, <4 x float> %2403, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2803 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2791, <4 x float> %2802, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2790, <4 x float> %2409, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2791, <4 x float> %2804, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2806 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2792, <4 x float> %2415, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2807 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2793, <4 x float> %2806, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2808 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2792, <4 x float> %2421, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2809 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2793, <4 x float> %2808, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2810 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1054
+ %2811 = load <8 x half>, ptr addrspace(3) %2810, align 16
+ %2812 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1056
+ %2813 = load <8 x half>, ptr addrspace(3) %2812, align 16
+ %2814 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2815 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2816 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2817 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2818 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2753, <4 x float> %2436, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2819 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2754, <4 x float> %2818, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2820 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2753, <4 x float> %2442, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2821 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2754, <4 x float> %2820, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2755, <4 x float> %2448, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2756, <4 x float> %2822, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2824 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2755, <4 x float> %2454, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2825 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2756, <4 x float> %2824, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2826 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1058
+ %2827 = load <8 x half>, ptr addrspace(3) %2826, align 16
+ %2828 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1060
+ %2829 = load <8 x half>, ptr addrspace(3) %2828, align 16
+ %2830 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2831 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2832 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2833 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2753, <4 x float> %2469, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2754, <4 x float> %2834, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2836 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2753, <4 x float> %2475, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2837 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2754, <4 x float> %2836, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2838 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2755, <4 x float> %2481, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2839 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2756, <4 x float> %2838, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2755, <4 x float> %2487, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2756, <4 x float> %2840, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2842 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2790, <4 x float> %2493, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2843 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2791, <4 x float> %2842, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2844 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2790, <4 x float> %2499, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2845 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2791, <4 x float> %2844, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2846 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2792, <4 x float> %2505, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2847 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2793, <4 x float> %2846, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2848 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2792, <4 x float> %2511, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2849 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2793, <4 x float> %2848, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2850 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2790, <4 x float> %2517, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2851 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2791, <4 x float> %2850, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2852 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2790, <4 x float> %2523, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2853 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2791, <4 x float> %2852, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2854 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2792, <4 x float> %2529, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2855 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2793, <4 x float> %2854, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2856 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2792, <4 x float> %2535, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2857 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2793, <4 x float> %2856, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2858 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1062
+ %2859 = load <8 x half>, ptr addrspace(3) %2858, align 16
+ %2860 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1064
+ %2861 = load <8 x half>, ptr addrspace(3) %2860, align 16
+ %2862 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2863 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2864 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2865 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2866 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2862, <4 x float> %2549, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2867 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2863, <4 x float> %2866, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2868 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2862, <4 x float> %2555, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2869 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2863, <4 x float> %2868, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2870 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2864, <4 x float> %2561, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2871 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2865, <4 x float> %2870, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2872 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2864, <4 x float> %2567, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2873 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2865, <4 x float> %2872, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2874 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2862, <4 x float> %2573, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2875 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2863, <4 x float> %2874, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2876 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2862, <4 x float> %2579, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2877 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2863, <4 x float> %2876, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2864, <4 x float> %2585, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2865, <4 x float> %2878, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2880 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2864, <4 x float> %2591, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2881 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2865, <4 x float> %2880, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2882 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1066
+ %2883 = load <8 x half>, ptr addrspace(3) %2882, align 16
+ %2884 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1068
+ %2885 = load <8 x half>, ptr addrspace(3) %2884, align 16
+ %2886 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2887 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2888 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2889 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2886, <4 x float> %2605, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2887, <4 x float> %2890, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2892 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2886, <4 x float> %2611, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2893 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2887, <4 x float> %2892, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2894 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2888, <4 x float> %2617, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2895 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2889, <4 x float> %2894, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2888, <4 x float> %2623, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2889, <4 x float> %2896, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2898 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2886, <4 x float> %2629, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2899 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2887, <4 x float> %2898, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2900 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2886, <4 x float> %2635, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2901 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2887, <4 x float> %2900, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2902 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2888, <4 x float> %2641, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2903 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2889, <4 x float> %2902, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2904 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2888, <4 x float> %2647, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2905 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2889, <4 x float> %2904, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %2906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2862, <4 x float> %2653, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2863, <4 x float> %2906, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2908 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2862, <4 x float> %2659, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2909 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2863, <4 x float> %2908, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2910 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2864, <4 x float> %2665, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2911 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2865, <4 x float> %2910, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2864, <4 x float> %2671, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2865, <4 x float> %2912, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2914 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2862, <4 x float> %2677, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2915 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2863, <4 x float> %2914, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2916 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2862, <4 x float> %2683, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2917 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2863, <4 x float> %2916, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2864, <4 x float> %2689, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2865, <4 x float> %2918, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2920 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2864, <4 x float> %2695, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2921 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2865, <4 x float> %2920, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2922 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2886, <4 x float> %2701, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2923 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2887, <4 x float> %2922, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2886, <4 x float> %2707, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2887, <4 x float> %2924, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2926 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2888, <4 x float> %2713, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2927 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2889, <4 x float> %2926, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2928 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2888, <4 x float> %2719, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2929 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2889, <4 x float> %2928, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2930 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2886, <4 x float> %2725, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2931 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2887, <4 x float> %2930, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2932 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2886, <4 x float> %2731, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2933 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2887, <4 x float> %2932, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2934 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2888, <4 x float> %2737, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2935 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2889, <4 x float> %2934, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2936 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2888, <4 x float> %2743, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %2937 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2889, <4 x float> %2936, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %2938 = shufflevector <2 x half> %2199, <2 x half> %2263, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2939 = shufflevector <2 x half> %2264, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2940 = shufflevector <8 x half> %2938, <8 x half> %2939, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2941 = shufflevector <2 x half> %2200, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2942 = shufflevector <8 x half> %2940, <8 x half> %2941, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2942, ptr addrspace(3) %199, align 16
+ %2943 = shufflevector <2 x half> %2201, <2 x half> %2265, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2944 = shufflevector <2 x half> %2266, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2945 = shufflevector <8 x half> %2943, <8 x half> %2944, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2946 = shufflevector <2 x half> %2202, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2947 = shufflevector <8 x half> %2945, <8 x half> %2946, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2947, ptr addrspace(3) %201, align 16
+ %2948 = shufflevector <2 x half> %2203, <2 x half> %2267, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2949 = shufflevector <2 x half> %2268, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2950 = shufflevector <8 x half> %2948, <8 x half> %2949, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2951 = shufflevector <2 x half> %2204, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2952 = shufflevector <8 x half> %2950, <8 x half> %2951, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2952, ptr addrspace(3) %203, align 16
+ %2953 = shufflevector <2 x half> %2205, <2 x half> %2269, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2954 = shufflevector <2 x half> %2270, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2955 = shufflevector <8 x half> %2953, <8 x half> %2954, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2956 = shufflevector <2 x half> %2206, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2957 = shufflevector <8 x half> %2955, <8 x half> %2956, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2957, ptr addrspace(3) %205, align 16
+ %2958 = shufflevector <2 x half> %2207, <2 x half> %2271, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2959 = shufflevector <2 x half> %2272, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2960 = shufflevector <8 x half> %2958, <8 x half> %2959, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2961 = shufflevector <2 x half> %2208, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2962 = shufflevector <8 x half> %2960, <8 x half> %2961, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2962, ptr addrspace(3) %207, align 16
+ %2963 = shufflevector <2 x half> %2209, <2 x half> %2273, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2964 = shufflevector <2 x half> %2274, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2965 = shufflevector <8 x half> %2963, <8 x half> %2964, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2966 = shufflevector <2 x half> %2210, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2967 = shufflevector <8 x half> %2965, <8 x half> %2966, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2967, ptr addrspace(3) %209, align 16
+ %2968 = shufflevector <2 x half> %2211, <2 x half> %2275, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2969 = shufflevector <2 x half> %2276, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2970 = shufflevector <8 x half> %2968, <8 x half> %2969, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2971 = shufflevector <2 x half> %2212, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2972 = shufflevector <8 x half> %2970, <8 x half> %2971, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2972, ptr addrspace(3) %211, align 16
+ %2973 = shufflevector <2 x half> %2213, <2 x half> %2277, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2974 = shufflevector <2 x half> %2278, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2975 = shufflevector <8 x half> %2973, <8 x half> %2974, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2976 = shufflevector <2 x half> %2214, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2977 = shufflevector <8 x half> %2975, <8 x half> %2976, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2977, ptr addrspace(3) %213, align 16
+ %2978 = shufflevector <2 x half> %2215, <2 x half> %2279, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2979 = shufflevector <2 x half> %2280, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2980 = shufflevector <8 x half> %2978, <8 x half> %2979, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2981 = shufflevector <2 x half> %2216, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2982 = shufflevector <8 x half> %2980, <8 x half> %2981, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2982, ptr addrspace(3) %214, align 16
+ %2983 = shufflevector <2 x half> %2217, <2 x half> %2281, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2984 = shufflevector <2 x half> %2282, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2985 = shufflevector <8 x half> %2983, <8 x half> %2984, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2986 = shufflevector <2 x half> %2218, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2987 = shufflevector <8 x half> %2985, <8 x half> %2986, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2987, ptr addrspace(3) %215, align 16
+ %2988 = shufflevector <2 x half> %2219, <2 x half> %2283, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2989 = shufflevector <2 x half> %2284, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2990 = shufflevector <8 x half> %2988, <8 x half> %2989, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2991 = shufflevector <2 x half> %2220, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2992 = shufflevector <8 x half> %2990, <8 x half> %2991, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2992, ptr addrspace(3) %216, align 16
+ %2993 = shufflevector <2 x half> %2221, <2 x half> %2285, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2994 = shufflevector <2 x half> %2286, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2995 = shufflevector <8 x half> %2993, <8 x half> %2994, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2996 = shufflevector <2 x half> %2222, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2997 = shufflevector <8 x half> %2995, <8 x half> %2996, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2997, ptr addrspace(3) %217, align 16
+ %2998 = shufflevector <2 x half> %2223, <2 x half> %2287, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2999 = shufflevector <2 x half> %2288, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3000 = shufflevector <8 x half> %2998, <8 x half> %2999, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3001 = shufflevector <2 x half> %2224, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3002 = shufflevector <8 x half> %3000, <8 x half> %3001, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3002, ptr addrspace(3) %218, align 16
+ %3003 = shufflevector <2 x half> %2225, <2 x half> %2289, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3004 = shufflevector <2 x half> %2290, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3005 = shufflevector <8 x half> %3003, <8 x half> %3004, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3006 = shufflevector <2 x half> %2226, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3007 = shufflevector <8 x half> %3005, <8 x half> %3006, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3007, ptr addrspace(3) %219, align 16
+ %3008 = shufflevector <2 x half> %2227, <2 x half> %2291, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3009 = shufflevector <2 x half> %2292, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3010 = shufflevector <8 x half> %3008, <8 x half> %3009, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3011 = shufflevector <2 x half> %2228, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3012 = shufflevector <8 x half> %3010, <8 x half> %3011, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3012, ptr addrspace(3) %220, align 16
+ %3013 = shufflevector <2 x half> %2229, <2 x half> %2293, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3014 = shufflevector <2 x half> %2294, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3015 = shufflevector <8 x half> %3013, <8 x half> %3014, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3016 = shufflevector <2 x half> %2230, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3017 = shufflevector <8 x half> %3015, <8 x half> %3016, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3017, ptr addrspace(3) %221, align 16
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %3018 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %3019 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %3020 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %3021 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %3022 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3023 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3024 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3025 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3026 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3027 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3028 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3029 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3022, <4 x float> %2762, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3023, <4 x float> %3030, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3032 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3022, <4 x float> %2764, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3033 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3023, <4 x float> %3032, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3034 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3024, <4 x float> %2766, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3035 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3025, <4 x float> %3034, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3024, <4 x float> %2768, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3025, <4 x float> %3036, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3038 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %3039 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %3040 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3041 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3042 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3043 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3044 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3022, <4 x float> %2779, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3045 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3023, <4 x float> %3044, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3022, <4 x float> %2781, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3023, <4 x float> %3046, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3048 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3024, <4 x float> %2783, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3049 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3025, <4 x float> %3048, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3050 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3024, <4 x float> %2785, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3051 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3025, <4 x float> %3050, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3052 = load <8 x half>, ptr addrspace(3) %2423, align 16
+ %3053 = load <8 x half>, ptr addrspace(3) %2425, align 16
+ %3054 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3055 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3056 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3057 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3022, <4 x float> %2819, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3023, <4 x float> %3058, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3060 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3022, <4 x float> %2821, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3061 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3023, <4 x float> %3060, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3062 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3024, <4 x float> %2823, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3063 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3025, <4 x float> %3062, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3024, <4 x float> %2825, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3025, <4 x float> %3064, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3066 = load <8 x half>, ptr addrspace(3) %2456, align 16
+ %3067 = load <8 x half>, ptr addrspace(3) %2458, align 16
+ %3068 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3069 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3070 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3071 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3072 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3022, <4 x float> %2835, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3073 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3023, <4 x float> %3072, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3074 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3022, <4 x float> %2837, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3075 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3023, <4 x float> %3074, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3024, <4 x float> %2839, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3025, <4 x float> %3076, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3078 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3024, <4 x float> %2841, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3079 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3025, <4 x float> %3078, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %3080 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %3081 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %3082 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3083 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3084 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3085 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3086 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3082, <4 x float> %2795, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3087 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3083, <4 x float> %3086, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3082, <4 x float> %2797, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3083, <4 x float> %3088, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3090 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3084, <4 x float> %2799, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3091 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3085, <4 x float> %3090, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3092 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3084, <4 x float> %2801, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3093 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3085, <4 x float> %3092, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3094 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3082, <4 x float> %2803, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3095 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3083, <4 x float> %3094, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3096 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3082, <4 x float> %2805, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3097 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3083, <4 x float> %3096, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3084, <4 x float> %2807, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3085, <4 x float> %3098, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3100 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3084, <4 x float> %2809, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3101 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3085, <4 x float> %3100, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3102 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3082, <4 x float> %2843, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3103 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3083, <4 x float> %3102, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3082, <4 x float> %2845, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3083, <4 x float> %3104, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3106 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3084, <4 x float> %2847, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3107 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3085, <4 x float> %3106, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3108 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3084, <4 x float> %2849, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3109 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3085, <4 x float> %3108, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3082, <4 x float> %2851, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3083, <4 x float> %3110, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3112 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3082, <4 x float> %2853, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3113 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3083, <4 x float> %3112, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3114 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3084, <4 x float> %2855, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3115 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3085, <4 x float> %3114, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3084, <4 x float> %2857, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3085, <4 x float> %3116, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %3118 = load <8 x half>, ptr addrspace(3) %2536, align 16
+ %3119 = load <8 x half>, ptr addrspace(3) %2538, align 16
+ %3120 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3121 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3122 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3123 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3124 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3120, <4 x float> %2867, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3125 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3121, <4 x float> %3124, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3126 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3120, <4 x float> %2869, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3127 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3121, <4 x float> %3126, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3122, <4 x float> %2871, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3123, <4 x float> %3128, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3130 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3122, <4 x float> %2873, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3131 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3123, <4 x float> %3130, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3132 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3120, <4 x float> %2875, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3133 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3121, <4 x float> %3132, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3120, <4 x float> %2877, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3121, <4 x float> %3134, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3136 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3122, <4 x float> %2879, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3137 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3123, <4 x float> %3136, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3138 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3122, <4 x float> %2881, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3139 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3123, <4 x float> %3138, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3120, <4 x float> %2907, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3121, <4 x float> %3140, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3142 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3120, <4 x float> %2909, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3143 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3121, <4 x float> %3142, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3144 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3122, <4 x float> %2911, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3145 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3123, <4 x float> %3144, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3146 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3122, <4 x float> %2913, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3147 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3123, <4 x float> %3146, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3148 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3120, <4 x float> %2915, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3149 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3121, <4 x float> %3148, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3120, <4 x float> %2917, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3121, <4 x float> %3150, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3152 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3122, <4 x float> %2919, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3153 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3123, <4 x float> %3152, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3154 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3122, <4 x float> %2921, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3155 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3123, <4 x float> %3154, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %3156 = load <8 x half>, ptr addrspace(3) %2592, align 16
+ %3157 = load <8 x half>, ptr addrspace(3) %2594, align 16
+ %3158 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3159 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3160 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3161 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3158, <4 x float> %2891, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3159, <4 x float> %3162, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3164 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3158, <4 x float> %2893, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3165 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3159, <4 x float> %3164, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3166 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3160, <4 x float> %2895, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3167 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3161, <4 x float> %3166, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3160, <4 x float> %2897, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3161, <4 x float> %3168, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3170 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3158, <4 x float> %2899, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3171 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3159, <4 x float> %3170, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3172 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3158, <4 x float> %2901, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3173 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3159, <4 x float> %3172, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3160, <4 x float> %2903, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3161, <4 x float> %3174, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3176 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3160, <4 x float> %2905, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3177 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3161, <4 x float> %3176, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3178 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3158, <4 x float> %2923, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3179 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3159, <4 x float> %3178, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3158, <4 x float> %2925, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3159, <4 x float> %3180, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3182 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3160, <4 x float> %2927, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3183 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3161, <4 x float> %3182, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3184 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3160, <4 x float> %2929, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3185 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3161, <4 x float> %3184, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3158, <4 x float> %2931, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3159, <4 x float> %3186, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3188 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3158, <4 x float> %2933, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3189 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3159, <4 x float> %3188, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3190 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3160, <4 x float> %2935, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3191 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3161, <4 x float> %3190, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3160, <4 x float> %2937, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3161, <4 x float> %3192, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ %3194 = load <8 x half>, ptr addrspace(3) %2745, align 16
+ %3195 = load <8 x half>, ptr addrspace(3) %2747, align 16
+ %3196 = load <8 x half>, ptr addrspace(3) %2749, align 16
+ %3197 = load <8 x half>, ptr addrspace(3) %2751, align 16
+ %3198 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3199 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3200 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3201 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3202 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3203 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3204 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3205 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3206 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3198, <4 x float> %3031, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3207 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3199, <4 x float> %3206, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3208 = extractelement <4 x float> %3207, i64 0
+ %3209 = extractelement <4 x float> %3207, i64 1
+ %3210 = extractelement <4 x float> %3207, i64 2
+ %3211 = extractelement <4 x float> %3207, i64 3
+ %3212 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3198, <4 x float> %3033, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3213 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3199, <4 x float> %3212, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3214 = extractelement <4 x float> %3213, i64 0
+ %3215 = extractelement <4 x float> %3213, i64 1
+ %3216 = extractelement <4 x float> %3213, i64 2
+ %3217 = extractelement <4 x float> %3213, i64 3
+ %3218 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3200, <4 x float> %3035, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3219 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3201, <4 x float> %3218, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3220 = extractelement <4 x float> %3219, i64 0
+ %3221 = extractelement <4 x float> %3219, i64 1
+ %3222 = extractelement <4 x float> %3219, i64 2
+ %3223 = extractelement <4 x float> %3219, i64 3
+ %3224 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3200, <4 x float> %3037, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3225 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3201, <4 x float> %3224, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3226 = extractelement <4 x float> %3225, i64 0
+ %3227 = extractelement <4 x float> %3225, i64 1
+ %3228 = extractelement <4 x float> %3225, i64 2
+ %3229 = extractelement <4 x float> %3225, i64 3
+ %3230 = load <8 x half>, ptr addrspace(3) %2770, align 16
+ %3231 = load <8 x half>, ptr addrspace(3) %2772, align 16
+ %3232 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3233 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3234 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3235 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3236 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3198, <4 x float> %3045, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3237 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3199, <4 x float> %3236, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3238 = extractelement <4 x float> %3237, i64 0
+ %3239 = extractelement <4 x float> %3237, i64 1
+ %3240 = extractelement <4 x float> %3237, i64 2
+ %3241 = extractelement <4 x float> %3237, i64 3
+ %3242 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3198, <4 x float> %3047, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3243 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3199, <4 x float> %3242, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3244 = extractelement <4 x float> %3243, i64 0
+ %3245 = extractelement <4 x float> %3243, i64 1
+ %3246 = extractelement <4 x float> %3243, i64 2
+ %3247 = extractelement <4 x float> %3243, i64 3
+ %3248 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3200, <4 x float> %3049, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3249 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3201, <4 x float> %3248, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3250 = extractelement <4 x float> %3249, i64 0
+ %3251 = extractelement <4 x float> %3249, i64 1
+ %3252 = extractelement <4 x float> %3249, i64 2
+ %3253 = extractelement <4 x float> %3249, i64 3
+ %3254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3200, <4 x float> %3051, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3201, <4 x float> %3254, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3256 = extractelement <4 x float> %3255, i64 0
+ %3257 = extractelement <4 x float> %3255, i64 1
+ %3258 = extractelement <4 x float> %3255, i64 2
+ %3259 = extractelement <4 x float> %3255, i64 3
+ %3260 = load <8 x half>, ptr addrspace(3) %2786, align 16
+ %3261 = load <8 x half>, ptr addrspace(3) %2788, align 16
+ %3262 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3263 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3264 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3265 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3262, <4 x float> %3087, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3263, <4 x float> %3266, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3268 = extractelement <4 x float> %3267, i64 0
+ %3269 = extractelement <4 x float> %3267, i64 1
+ %3270 = extractelement <4 x float> %3267, i64 2
+ %3271 = extractelement <4 x float> %3267, i64 3
+ %3272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3262, <4 x float> %3089, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3263, <4 x float> %3272, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3274 = extractelement <4 x float> %3273, i64 0
+ %3275 = extractelement <4 x float> %3273, i64 1
+ %3276 = extractelement <4 x float> %3273, i64 2
+ %3277 = extractelement <4 x float> %3273, i64 3
+ %3278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3264, <4 x float> %3091, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3265, <4 x float> %3278, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3280 = extractelement <4 x float> %3279, i64 0
+ %3281 = extractelement <4 x float> %3279, i64 1
+ %3282 = extractelement <4 x float> %3279, i64 2
+ %3283 = extractelement <4 x float> %3279, i64 3
+ %3284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3264, <4 x float> %3093, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3265, <4 x float> %3284, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3286 = extractelement <4 x float> %3285, i64 0
+ %3287 = extractelement <4 x float> %3285, i64 1
+ %3288 = extractelement <4 x float> %3285, i64 2
+ %3289 = extractelement <4 x float> %3285, i64 3
+ %3290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3262, <4 x float> %3095, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3263, <4 x float> %3290, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3292 = extractelement <4 x float> %3291, i64 0
+ %3293 = extractelement <4 x float> %3291, i64 1
+ %3294 = extractelement <4 x float> %3291, i64 2
+ %3295 = extractelement <4 x float> %3291, i64 3
+ %3296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3262, <4 x float> %3097, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3263, <4 x float> %3296, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3298 = extractelement <4 x float> %3297, i64 0
+ %3299 = extractelement <4 x float> %3297, i64 1
+ %3300 = extractelement <4 x float> %3297, i64 2
+ %3301 = extractelement <4 x float> %3297, i64 3
+ %3302 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3264, <4 x float> %3099, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3303 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3265, <4 x float> %3302, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3304 = extractelement <4 x float> %3303, i64 0
+ %3305 = extractelement <4 x float> %3303, i64 1
+ %3306 = extractelement <4 x float> %3303, i64 2
+ %3307 = extractelement <4 x float> %3303, i64 3
+ %3308 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3264, <4 x float> %3101, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3309 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3265, <4 x float> %3308, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3310 = extractelement <4 x float> %3309, i64 0
+ %3311 = extractelement <4 x float> %3309, i64 1
+ %3312 = extractelement <4 x float> %3309, i64 2
+ %3313 = extractelement <4 x float> %3309, i64 3
+ %3314 = load <8 x half>, ptr addrspace(3) %2810, align 16
+ %3315 = load <8 x half>, ptr addrspace(3) %2812, align 16
+ %3316 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3317 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3318 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3319 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3320 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3198, <4 x float> %3059, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3321 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3199, <4 x float> %3320, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3322 = extractelement <4 x float> %3321, i64 0
+ %3323 = extractelement <4 x float> %3321, i64 1
+ %3324 = extractelement <4 x float> %3321, i64 2
+ %3325 = extractelement <4 x float> %3321, i64 3
+ %3326 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3198, <4 x float> %3061, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3327 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3199, <4 x float> %3326, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3328 = extractelement <4 x float> %3327, i64 0
+ %3329 = extractelement <4 x float> %3327, i64 1
+ %3330 = extractelement <4 x float> %3327, i64 2
+ %3331 = extractelement <4 x float> %3327, i64 3
+ %3332 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3200, <4 x float> %3063, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3333 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3201, <4 x float> %3332, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3334 = extractelement <4 x float> %3333, i64 0
+ %3335 = extractelement <4 x float> %3333, i64 1
+ %3336 = extractelement <4 x float> %3333, i64 2
+ %3337 = extractelement <4 x float> %3333, i64 3
+ %3338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3200, <4 x float> %3065, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3201, <4 x float> %3338, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3340 = extractelement <4 x float> %3339, i64 0
+ %3341 = extractelement <4 x float> %3339, i64 1
+ %3342 = extractelement <4 x float> %3339, i64 2
+ %3343 = extractelement <4 x float> %3339, i64 3
+ %3344 = load <8 x half>, ptr addrspace(3) %2826, align 16
+ %3345 = load <8 x half>, ptr addrspace(3) %2828, align 16
+ %3346 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3347 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3348 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3349 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3198, <4 x float> %3073, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3199, <4 x float> %3350, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3352 = extractelement <4 x float> %3351, i64 0
+ %3353 = extractelement <4 x float> %3351, i64 1
+ %3354 = extractelement <4 x float> %3351, i64 2
+ %3355 = extractelement <4 x float> %3351, i64 3
+ %3356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3198, <4 x float> %3075, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3199, <4 x float> %3356, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3358 = extractelement <4 x float> %3357, i64 0
+ %3359 = extractelement <4 x float> %3357, i64 1
+ %3360 = extractelement <4 x float> %3357, i64 2
+ %3361 = extractelement <4 x float> %3357, i64 3
+ %3362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3200, <4 x float> %3077, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3201, <4 x float> %3362, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3364 = extractelement <4 x float> %3363, i64 0
+ %3365 = extractelement <4 x float> %3363, i64 1
+ %3366 = extractelement <4 x float> %3363, i64 2
+ %3367 = extractelement <4 x float> %3363, i64 3
+ %3368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3200, <4 x float> %3079, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3201, <4 x float> %3368, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3370 = extractelement <4 x float> %3369, i64 0
+ %3371 = extractelement <4 x float> %3369, i64 1
+ %3372 = extractelement <4 x float> %3369, i64 2
+ %3373 = extractelement <4 x float> %3369, i64 3
+ %3374 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3262, <4 x float> %3103, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3375 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3263, <4 x float> %3374, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3376 = extractelement <4 x float> %3375, i64 0
+ %3377 = extractelement <4 x float> %3375, i64 1
+ %3378 = extractelement <4 x float> %3375, i64 2
+ %3379 = extractelement <4 x float> %3375, i64 3
+ %3380 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3262, <4 x float> %3105, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3381 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3263, <4 x float> %3380, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3382 = extractelement <4 x float> %3381, i64 0
+ %3383 = extractelement <4 x float> %3381, i64 1
+ %3384 = extractelement <4 x float> %3381, i64 2
+ %3385 = extractelement <4 x float> %3381, i64 3
+ %3386 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3264, <4 x float> %3107, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3387 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3265, <4 x float> %3386, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3388 = extractelement <4 x float> %3387, i64 0
+ %3389 = extractelement <4 x float> %3387, i64 1
+ %3390 = extractelement <4 x float> %3387, i64 2
+ %3391 = extractelement <4 x float> %3387, i64 3
+ %3392 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3264, <4 x float> %3109, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3393 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3265, <4 x float> %3392, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3394 = extractelement <4 x float> %3393, i64 0
+ %3395 = extractelement <4 x float> %3393, i64 1
+ %3396 = extractelement <4 x float> %3393, i64 2
+ %3397 = extractelement <4 x float> %3393, i64 3
+ %3398 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3262, <4 x float> %3111, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3399 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3263, <4 x float> %3398, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3400 = extractelement <4 x float> %3399, i64 0
+ %3401 = extractelement <4 x float> %3399, i64 1
+ %3402 = extractelement <4 x float> %3399, i64 2
+ %3403 = extractelement <4 x float> %3399, i64 3
+ %3404 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3262, <4 x float> %3113, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3405 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3263, <4 x float> %3404, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3406 = extractelement <4 x float> %3405, i64 0
+ %3407 = extractelement <4 x float> %3405, i64 1
+ %3408 = extractelement <4 x float> %3405, i64 2
+ %3409 = extractelement <4 x float> %3405, i64 3
+ %3410 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3264, <4 x float> %3115, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3411 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3265, <4 x float> %3410, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3412 = extractelement <4 x float> %3411, i64 0
+ %3413 = extractelement <4 x float> %3411, i64 1
+ %3414 = extractelement <4 x float> %3411, i64 2
+ %3415 = extractelement <4 x float> %3411, i64 3
+ %3416 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3264, <4 x float> %3117, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3417 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3265, <4 x float> %3416, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3418 = extractelement <4 x float> %3417, i64 0
+ %3419 = extractelement <4 x float> %3417, i64 1
+ %3420 = extractelement <4 x float> %3417, i64 2
+ %3421 = extractelement <4 x float> %3417, i64 3
+ %3422 = load <8 x half>, ptr addrspace(3) %2858, align 16
+ %3423 = load <8 x half>, ptr addrspace(3) %2860, align 16
+ %3424 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3425 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3426 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3427 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3428 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3424, <4 x float> %3125, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3429 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3425, <4 x float> %3428, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3430 = extractelement <4 x float> %3429, i64 0
+ %3431 = extractelement <4 x float> %3429, i64 1
+ %3432 = extractelement <4 x float> %3429, i64 2
+ %3433 = extractelement <4 x float> %3429, i64 3
+ %3434 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3424, <4 x float> %3127, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3425, <4 x float> %3434, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3436 = extractelement <4 x float> %3435, i64 0
+ %3437 = extractelement <4 x float> %3435, i64 1
+ %3438 = extractelement <4 x float> %3435, i64 2
+ %3439 = extractelement <4 x float> %3435, i64 3
+ %3440 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3426, <4 x float> %3129, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3427, <4 x float> %3440, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3442 = extractelement <4 x float> %3441, i64 0
+ %3443 = extractelement <4 x float> %3441, i64 1
+ %3444 = extractelement <4 x float> %3441, i64 2
+ %3445 = extractelement <4 x float> %3441, i64 3
+ %3446 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3426, <4 x float> %3131, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3427, <4 x float> %3446, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3448 = extractelement <4 x float> %3447, i64 0
+ %3449 = extractelement <4 x float> %3447, i64 1
+ %3450 = extractelement <4 x float> %3447, i64 2
+ %3451 = extractelement <4 x float> %3447, i64 3
+ %3452 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3424, <4 x float> %3133, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3425, <4 x float> %3452, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3454 = extractelement <4 x float> %3453, i64 0
+ %3455 = extractelement <4 x float> %3453, i64 1
+ %3456 = extractelement <4 x float> %3453, i64 2
+ %3457 = extractelement <4 x float> %3453, i64 3
+ %3458 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3424, <4 x float> %3135, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3459 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3425, <4 x float> %3458, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3460 = extractelement <4 x float> %3459, i64 0
+ %3461 = extractelement <4 x float> %3459, i64 1
+ %3462 = extractelement <4 x float> %3459, i64 2
+ %3463 = extractelement <4 x float> %3459, i64 3
+ %3464 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3426, <4 x float> %3137, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3465 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3427, <4 x float> %3464, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3466 = extractelement <4 x float> %3465, i64 0
+ %3467 = extractelement <4 x float> %3465, i64 1
+ %3468 = extractelement <4 x float> %3465, i64 2
+ %3469 = extractelement <4 x float> %3465, i64 3
+ %3470 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3426, <4 x float> %3139, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3471 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3427, <4 x float> %3470, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3472 = extractelement <4 x float> %3471, i64 0
+ %3473 = extractelement <4 x float> %3471, i64 1
+ %3474 = extractelement <4 x float> %3471, i64 2
+ %3475 = extractelement <4 x float> %3471, i64 3
+ %3476 = load <8 x half>, ptr addrspace(3) %2882, align 16
+ %3477 = load <8 x half>, ptr addrspace(3) %2884, align 16
+ %3478 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3479 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3480 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3481 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3482 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3478, <4 x float> %3163, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3483 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3479, <4 x float> %3482, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3484 = extractelement <4 x float> %3483, i64 0
+ %3485 = extractelement <4 x float> %3483, i64 1
+ %3486 = extractelement <4 x float> %3483, i64 2
+ %3487 = extractelement <4 x float> %3483, i64 3
+ %3488 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3478, <4 x float> %3165, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3489 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3479, <4 x float> %3488, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3490 = extractelement <4 x float> %3489, i64 0
+ %3491 = extractelement <4 x float> %3489, i64 1
+ %3492 = extractelement <4 x float> %3489, i64 2
+ %3493 = extractelement <4 x float> %3489, i64 3
+ %3494 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3480, <4 x float> %3167, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3495 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3481, <4 x float> %3494, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3496 = extractelement <4 x float> %3495, i64 0
+ %3497 = extractelement <4 x float> %3495, i64 1
+ %3498 = extractelement <4 x float> %3495, i64 2
+ %3499 = extractelement <4 x float> %3495, i64 3
+ %3500 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3480, <4 x float> %3169, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3501 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3481, <4 x float> %3500, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3502 = extractelement <4 x float> %3501, i64 0
+ %3503 = extractelement <4 x float> %3501, i64 1
+ %3504 = extractelement <4 x float> %3501, i64 2
+ %3505 = extractelement <4 x float> %3501, i64 3
+ %3506 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3478, <4 x float> %3171, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3507 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3479, <4 x float> %3506, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3508 = extractelement <4 x float> %3507, i64 0
+ %3509 = extractelement <4 x float> %3507, i64 1
+ %3510 = extractelement <4 x float> %3507, i64 2
+ %3511 = extractelement <4 x float> %3507, i64 3
+ %3512 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3478, <4 x float> %3173, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3513 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3479, <4 x float> %3512, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3514 = extractelement <4 x float> %3513, i64 0
+ %3515 = extractelement <4 x float> %3513, i64 1
+ %3516 = extractelement <4 x float> %3513, i64 2
+ %3517 = extractelement <4 x float> %3513, i64 3
+ %3518 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3480, <4 x float> %3175, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3519 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3481, <4 x float> %3518, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3520 = extractelement <4 x float> %3519, i64 0
+ %3521 = extractelement <4 x float> %3519, i64 1
+ %3522 = extractelement <4 x float> %3519, i64 2
+ %3523 = extractelement <4 x float> %3519, i64 3
+ %3524 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3480, <4 x float> %3177, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3525 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3481, <4 x float> %3524, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3526 = extractelement <4 x float> %3525, i64 0
+ %3527 = extractelement <4 x float> %3525, i64 1
+ %3528 = extractelement <4 x float> %3525, i64 2
+ %3529 = extractelement <4 x float> %3525, i64 3
+ %3530 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3424, <4 x float> %3141, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3531 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3425, <4 x float> %3530, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3532 = extractelement <4 x float> %3531, i64 0
+ %3533 = extractelement <4 x float> %3531, i64 1
+ %3534 = extractelement <4 x float> %3531, i64 2
+ %3535 = extractelement <4 x float> %3531, i64 3
+ %3536 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3424, <4 x float> %3143, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3537 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3425, <4 x float> %3536, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3538 = extractelement <4 x float> %3537, i64 0
+ %3539 = extractelement <4 x float> %3537, i64 1
+ %3540 = extractelement <4 x float> %3537, i64 2
+ %3541 = extractelement <4 x float> %3537, i64 3
+ %3542 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3426, <4 x float> %3145, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3543 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3427, <4 x float> %3542, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3544 = extractelement <4 x float> %3543, i64 0
+ %3545 = extractelement <4 x float> %3543, i64 1
+ %3546 = extractelement <4 x float> %3543, i64 2
+ %3547 = extractelement <4 x float> %3543, i64 3
+ %3548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3426, <4 x float> %3147, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3427, <4 x float> %3548, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3550 = extractelement <4 x float> %3549, i64 0
+ %3551 = extractelement <4 x float> %3549, i64 1
+ %3552 = extractelement <4 x float> %3549, i64 2
+ %3553 = extractelement <4 x float> %3549, i64 3
+ %3554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3424, <4 x float> %3149, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3425, <4 x float> %3554, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3556 = extractelement <4 x float> %3555, i64 0
+ %3557 = extractelement <4 x float> %3555, i64 1
+ %3558 = extractelement <4 x float> %3555, i64 2
+ %3559 = extractelement <4 x float> %3555, i64 3
+ %3560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3424, <4 x float> %3151, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3425, <4 x float> %3560, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3562 = extractelement <4 x float> %3561, i64 0
+ %3563 = extractelement <4 x float> %3561, i64 1
+ %3564 = extractelement <4 x float> %3561, i64 2
+ %3565 = extractelement <4 x float> %3561, i64 3
+ %3566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3426, <4 x float> %3153, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3427, <4 x float> %3566, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3568 = extractelement <4 x float> %3567, i64 0
+ %3569 = extractelement <4 x float> %3567, i64 1
+ %3570 = extractelement <4 x float> %3567, i64 2
+ %3571 = extractelement <4 x float> %3567, i64 3
+ %3572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3426, <4 x float> %3155, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3427, <4 x float> %3572, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3574 = extractelement <4 x float> %3573, i64 0
+ %3575 = extractelement <4 x float> %3573, i64 1
+ %3576 = extractelement <4 x float> %3573, i64 2
+ %3577 = extractelement <4 x float> %3573, i64 3
+ %3578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3478, <4 x float> %3179, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3479, <4 x float> %3578, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3580 = extractelement <4 x float> %3579, i64 0
+ %3581 = extractelement <4 x float> %3579, i64 1
+ %3582 = extractelement <4 x float> %3579, i64 2
+ %3583 = extractelement <4 x float> %3579, i64 3
+ %3584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3478, <4 x float> %3181, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3479, <4 x float> %3584, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3586 = extractelement <4 x float> %3585, i64 0
+ %3587 = extractelement <4 x float> %3585, i64 1
+ %3588 = extractelement <4 x float> %3585, i64 2
+ %3589 = extractelement <4 x float> %3585, i64 3
+ %3590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3480, <4 x float> %3183, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3481, <4 x float> %3590, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3592 = extractelement <4 x float> %3591, i64 0
+ %3593 = extractelement <4 x float> %3591, i64 1
+ %3594 = extractelement <4 x float> %3591, i64 2
+ %3595 = extractelement <4 x float> %3591, i64 3
+ %3596 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3480, <4 x float> %3185, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3597 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3481, <4 x float> %3596, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3598 = extractelement <4 x float> %3597, i64 0
+ %3599 = extractelement <4 x float> %3597, i64 1
+ %3600 = extractelement <4 x float> %3597, i64 2
+ %3601 = extractelement <4 x float> %3597, i64 3
+ %3602 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3478, <4 x float> %3187, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3603 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3479, <4 x float> %3602, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3604 = extractelement <4 x float> %3603, i64 0
+ %3605 = extractelement <4 x float> %3603, i64 1
+ %3606 = extractelement <4 x float> %3603, i64 2
+ %3607 = extractelement <4 x float> %3603, i64 3
+ %3608 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3478, <4 x float> %3189, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3609 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3479, <4 x float> %3608, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3610 = extractelement <4 x float> %3609, i64 0
+ %3611 = extractelement <4 x float> %3609, i64 1
+ %3612 = extractelement <4 x float> %3609, i64 2
+ %3613 = extractelement <4 x float> %3609, i64 3
+ %3614 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3480, <4 x float> %3191, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3615 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3481, <4 x float> %3614, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3616 = extractelement <4 x float> %3615, i64 0
+ %3617 = extractelement <4 x float> %3615, i64 1
+ %3618 = extractelement <4 x float> %3615, i64 2
+ %3619 = extractelement <4 x float> %3615, i64 3
+ %3620 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3480, <4 x float> %3193, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ %3621 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3481, <4 x float> %3620, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 2038)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030)
+ %3622 = extractelement <4 x float> %3621, i64 0
+ %3623 = extractelement <4 x float> %3621, i64 1
+ %3624 = extractelement <4 x float> %3621, i64 2
+ %3625 = extractelement <4 x float> %3621, i64 3
+ %3626 = mul i32 %31, %9
+ %3627 = sext i32 %3626 to i64
+ %3628 = getelementptr half, ptr addrspace(1) %2, i64 %3627
+ %3629 = sext i32 %118 to i64
+ %3630 = getelementptr half, ptr addrspace(1) %3628, i64 %3629
+ %3631 = mul i32 %9, %225
+ %3632 = mul i32 %9, %2309
+ %3633 = mul i32 %9, %2308
+ %3634 = mul i32 %9, %2307
+ %3635 = mul i32 %9, %2306
+ %3636 = mul i32 %9, %2305
+ %3637 = mul i32 %9, %2304
+ %3638 = mul i32 %9, %2303
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030)
+ %3639 = add i32 %3631, %2295
+ %3640 = add i32 %3631, %2302
+ %3641 = add i32 %3632, %2295
+ %3642 = add i32 %3632, %2302
+ %3643 = fptrunc float %3208 to half
+ %3644 = fptrunc float %3209 to half
+ %3645 = fptrunc float %3210 to half
+ %3646 = fptrunc float %3211 to half
+ %3647 = fptrunc float %3214 to half
+ %3648 = fptrunc float %3215 to half
+ %3649 = fptrunc float %3216 to half
+ %3650 = fptrunc float %3217 to half
+ %3651 = fptrunc float %3220 to half
+ %3652 = fptrunc float %3221 to half
+ %3653 = fptrunc float %3222 to half
+ %3654 = fptrunc float %3223 to half
+ %3655 = fptrunc float %3226 to half
+ %3656 = fptrunc float %3227 to half
+ %3657 = fptrunc float %3228 to half
+ %3658 = fptrunc float %3229 to half
+ %3659 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %3630, i16 0, i32 2147483646, i32 159744)
+ %3660 = insertelement <4 x half> poison, half %3643, i64 0
+ %3661 = insertelement <4 x half> %3660, half %3644, i64 1
+ %3662 = insertelement <4 x half> %3661, half %3645, i64 2
+ %3663 = insertelement <4 x half> %3662, half %3646, i64 3
+ %3664 = bitcast <4 x half> %3663 to <2 x i32>
+ %3665 = shl i32 %3639, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3664, ptr addrspace(8) %3659, i32 %3665, i32 0, i32 0)
+ %3666 = insertelement <4 x half> poison, half %3647, i64 0
+ %3667 = insertelement <4 x half> %3666, half %3648, i64 1
+ %3668 = insertelement <4 x half> %3667, half %3649, i64 2
+ %3669 = insertelement <4 x half> %3668, half %3650, i64 3
+ %3670 = bitcast <4 x half> %3669 to <2 x i32>
+ %3671 = shl i32 %3640, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3670, ptr addrspace(8) %3659, i32 %3671, i32 0, i32 0)
+ %3672 = insertelement <4 x half> poison, half %3651, i64 0
+ %3673 = insertelement <4 x half> %3672, half %3652, i64 1
+ %3674 = insertelement <4 x half> %3673, half %3653, i64 2
+ %3675 = insertelement <4 x half> %3674, half %3654, i64 3
+ %3676 = bitcast <4 x half> %3675 to <2 x i32>
+ %3677 = shl i32 %3641, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3676, ptr addrspace(8) %3659, i32 %3677, i32 0, i32 0)
+ %3678 = insertelement <4 x half> poison, half %3655, i64 0
+ %3679 = insertelement <4 x half> %3678, half %3656, i64 1
+ %3680 = insertelement <4 x half> %3679, half %3657, i64 2
+ %3681 = insertelement <4 x half> %3680, half %3658, i64 3
+ %3682 = bitcast <4 x half> %3681 to <2 x i32>
+ %3683 = shl i32 %3642, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3682, ptr addrspace(8) %3659, i32 %3683, i32 0, i32 0)
+ %3684 = add i32 %3631, %2301
+ %3685 = add i32 %3631, %2300
+ %3686 = add i32 %3632, %2301
+ %3687 = add i32 %3632, %2300
+ %3688 = fptrunc float %3238 to half
+ %3689 = fptrunc float %3239 to half
+ %3690 = fptrunc float %3240 to half
+ %3691 = fptrunc float %3241 to half
+ %3692 = fptrunc float %3244 to half
+ %3693 = fptrunc float %3245 to half
+ %3694 = fptrunc float %3246 to half
+ %3695 = fptrunc float %3247 to half
+ %3696 = fptrunc float %3250 to half
+ %3697 = fptrunc float %3251 to half
+ %3698 = fptrunc float %3252 to half
+ %3699 = fptrunc float %3253 to half
+ %3700 = fptrunc float %3256 to half
+ %3701 = fptrunc float %3257 to half
+ %3702 = fptrunc float %3258 to half
+ %3703 = fptrunc float %3259 to half
+ %3704 = insertelement <4 x half> poison, half %3688, i64 0
+ %3705 = insertelement <4 x half> %3704, half %3689, i64 1
+ %3706 = insertelement <4 x half> %3705, half %3690, i64 2
+ %3707 = insertelement <4 x half> %3706, half %3691, i64 3
+ %3708 = bitcast <4 x half> %3707 to <2 x i32>
+ %3709 = shl i32 %3684, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3708, ptr addrspace(8) %3659, i32 %3709, i32 0, i32 0)
+ %3710 = insertelement <4 x half> poison, half %3692, i64 0
+ %3711 = insertelement <4 x half> %3710, half %3693, i64 1
+ %3712 = insertelement <4 x half> %3711, half %3694, i64 2
+ %3713 = insertelement <4 x half> %3712, half %3695, i64 3
+ %3714 = bitcast <4 x half> %3713 to <2 x i32>
+ %3715 = shl i32 %3685, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3714, ptr addrspace(8) %3659, i32 %3715, i32 0, i32 0)
+ %3716 = insertelement <4 x half> poison, half %3696, i64 0
+ %3717 = insertelement <4 x half> %3716, half %3697, i64 1
+ %3718 = insertelement <4 x half> %3717, half %3698, i64 2
+ %3719 = insertelement <4 x half> %3718, half %3699, i64 3
+ %3720 = bitcast <4 x half> %3719 to <2 x i32>
+ %3721 = shl i32 %3686, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3720, ptr addrspace(8) %3659, i32 %3721, i32 0, i32 0)
+ %3722 = insertelement <4 x half> poison, half %3700, i64 0
+ %3723 = insertelement <4 x half> %3722, half %3701, i64 1
+ %3724 = insertelement <4 x half> %3723, half %3702, i64 2
+ %3725 = insertelement <4 x half> %3724, half %3703, i64 3
+ %3726 = bitcast <4 x half> %3725 to <2 x i32>
+ %3727 = shl i32 %3687, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3726, ptr addrspace(8) %3659, i32 %3727, i32 0, i32 0)
+ %3728 = add i32 %3631, %2299
+ %3729 = add i32 %3631, %2298
+ %3730 = add i32 %3632, %2299
+ %3731 = add i32 %3632, %2298
+ %3732 = fptrunc float %3322 to half
+ %3733 = fptrunc float %3323 to half
+ %3734 = fptrunc float %3324 to half
+ %3735 = fptrunc float %3325 to half
+ %3736 = fptrunc float %3328 to half
+ %3737 = fptrunc float %3329 to half
+ %3738 = fptrunc float %3330 to half
+ %3739 = fptrunc float %3331 to half
+ %3740 = fptrunc float %3334 to half
+ %3741 = fptrunc float %3335 to half
+ %3742 = fptrunc float %3336 to half
+ %3743 = fptrunc float %3337 to half
+ %3744 = fptrunc float %3340 to half
+ %3745 = fptrunc float %3341 to half
+ %3746 = fptrunc float %3342 to half
+ %3747 = fptrunc float %3343 to half
+ %3748 = insertelement <4 x half> poison, half %3732, i64 0
+ %3749 = insertelement <4 x half> %3748, half %3733, i64 1
+ %3750 = insertelement <4 x half> %3749, half %3734, i64 2
+ %3751 = insertelement <4 x half> %3750, half %3735, i64 3
+ %3752 = bitcast <4 x half> %3751 to <2 x i32>
+ %3753 = shl i32 %3728, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3752, ptr addrspace(8) %3659, i32 %3753, i32 0, i32 0)
+ %3754 = insertelement <4 x half> poison, half %3736, i64 0
+ %3755 = insertelement <4 x half> %3754, half %3737, i64 1
+ %3756 = insertelement <4 x half> %3755, half %3738, i64 2
+ %3757 = insertelement <4 x half> %3756, half %3739, i64 3
+ %3758 = bitcast <4 x half> %3757 to <2 x i32>
+ %3759 = shl i32 %3729, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3758, ptr addrspace(8) %3659, i32 %3759, i32 0, i32 0)
+ %3760 = insertelement <4 x half> poison, half %3740, i64 0
+ %3761 = insertelement <4 x half> %3760, half %3741, i64 1
+ %3762 = insertelement <4 x half> %3761, half %3742, i64 2
+ %3763 = insertelement <4 x half> %3762, half %3743, i64 3
+ %3764 = bitcast <4 x half> %3763 to <2 x i32>
+ %3765 = shl i32 %3730, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3764, ptr addrspace(8) %3659, i32 %3765, i32 0, i32 0)
+ %3766 = insertelement <4 x half> poison, half %3744, i64 0
+ %3767 = insertelement <4 x half> %3766, half %3745, i64 1
+ %3768 = insertelement <4 x half> %3767, half %3746, i64 2
+ %3769 = insertelement <4 x half> %3768, half %3747, i64 3
+ %3770 = bitcast <4 x half> %3769 to <2 x i32>
+ %3771 = shl i32 %3731, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3770, ptr addrspace(8) %3659, i32 %3771, i32 0, i32 0)
+ %3772 = add i32 %3631, %2297
+ %3773 = add i32 %3631, %2296
+ %3774 = add i32 %3632, %2297
+ %3775 = add i32 %3632, %2296
+ %3776 = fptrunc float %3352 to half
+ %3777 = fptrunc float %3353 to half
+ %3778 = fptrunc float %3354 to half
+ %3779 = fptrunc float %3355 to half
+ %3780 = fptrunc float %3358 to half
+ %3781 = fptrunc float %3359 to half
+ %3782 = fptrunc float %3360 to half
+ %3783 = fptrunc float %3361 to half
+ %3784 = fptrunc float %3364 to half
+ %3785 = fptrunc float %3365 to half
+ %3786 = fptrunc float %3366 to half
+ %3787 = fptrunc float %3367 to half
+ %3788 = fptrunc float %3370 to half
+ %3789 = fptrunc float %3371 to half
+ %3790 = fptrunc float %3372 to half
+ %3791 = fptrunc float %3373 to half
+ %3792 = insertelement <4 x half> poison, half %3776, i64 0
+ %3793 = insertelement <4 x half> %3792, half %3777, i64 1
+ %3794 = insertelement <4 x half> %3793, half %3778, i64 2
+ %3795 = insertelement <4 x half> %3794, half %3779, i64 3
+ %3796 = bitcast <4 x half> %3795 to <2 x i32>
+ %3797 = shl i32 %3772, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3796, ptr addrspace(8) %3659, i32 %3797, i32 0, i32 0)
+ %3798 = insertelement <4 x half> poison, half %3780, i64 0
+ %3799 = insertelement <4 x half> %3798, half %3781, i64 1
+ %3800 = insertelement <4 x half> %3799, half %3782, i64 2
+ %3801 = insertelement <4 x half> %3800, half %3783, i64 3
+ %3802 = bitcast <4 x half> %3801 to <2 x i32>
+ %3803 = shl i32 %3773, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3802, ptr addrspace(8) %3659, i32 %3803, i32 0, i32 0)
+ %3804 = insertelement <4 x half> poison, half %3784, i64 0
+ %3805 = insertelement <4 x half> %3804, half %3785, i64 1
+ %3806 = insertelement <4 x half> %3805, half %3786, i64 2
+ %3807 = insertelement <4 x half> %3806, half %3787, i64 3
+ %3808 = bitcast <4 x half> %3807 to <2 x i32>
+ %3809 = shl i32 %3774, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3808, ptr addrspace(8) %3659, i32 %3809, i32 0, i32 0)
+ %3810 = insertelement <4 x half> poison, half %3788, i64 0
+ %3811 = insertelement <4 x half> %3810, half %3789, i64 1
+ %3812 = insertelement <4 x half> %3811, half %3790, i64 2
+ %3813 = insertelement <4 x half> %3812, half %3791, i64 3
+ %3814 = bitcast <4 x half> %3813 to <2 x i32>
+ %3815 = shl i32 %3775, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3814, ptr addrspace(8) %3659, i32 %3815, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030)
+ %3816 = add i32 %3633, %2295
+ %3817 = add i32 %3633, %2302
+ %3818 = add i32 %3634, %2295
+ %3819 = add i32 %3634, %2302
+ %3820 = fptrunc float %3268 to half
+ %3821 = fptrunc float %3269 to half
+ %3822 = fptrunc float %3270 to half
+ %3823 = fptrunc float %3271 to half
+ %3824 = fptrunc float %3274 to half
+ %3825 = fptrunc float %3275 to half
+ %3826 = fptrunc float %3276 to half
+ %3827 = fptrunc float %3277 to half
+ %3828 = fptrunc float %3280 to half
+ %3829 = fptrunc float %3281 to half
+ %3830 = fptrunc float %3282 to half
+ %3831 = fptrunc float %3283 to half
+ %3832 = fptrunc float %3286 to half
+ %3833 = fptrunc float %3287 to half
+ %3834 = fptrunc float %3288 to half
+ %3835 = fptrunc float %3289 to half
+ %3836 = insertelement <4 x half> poison, half %3820, i64 0
+ %3837 = insertelement <4 x half> %3836, half %3821, i64 1
+ %3838 = insertelement <4 x half> %3837, half %3822, i64 2
+ %3839 = insertelement <4 x half> %3838, half %3823, i64 3
+ %3840 = bitcast <4 x half> %3839 to <2 x i32>
+ %3841 = shl i32 %3816, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3840, ptr addrspace(8) %3659, i32 %3841, i32 0, i32 0)
+ %3842 = insertelement <4 x half> poison, half %3824, i64 0
+ %3843 = insertelement <4 x half> %3842, half %3825, i64 1
+ %3844 = insertelement <4 x half> %3843, half %3826, i64 2
+ %3845 = insertelement <4 x half> %3844, half %3827, i64 3
+ %3846 = bitcast <4 x half> %3845 to <2 x i32>
+ %3847 = shl i32 %3817, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3846, ptr addrspace(8) %3659, i32 %3847, i32 0, i32 0)
+ %3848 = insertelement <4 x half> poison, half %3828, i64 0
+ %3849 = insertelement <4 x half> %3848, half %3829, i64 1
+ %3850 = insertelement <4 x half> %3849, half %3830, i64 2
+ %3851 = insertelement <4 x half> %3850, half %3831, i64 3
+ %3852 = bitcast <4 x half> %3851 to <2 x i32>
+ %3853 = shl i32 %3818, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3852, ptr addrspace(8) %3659, i32 %3853, i32 0, i32 0)
+ %3854 = insertelement <4 x half> poison, half %3832, i64 0
+ %3855 = insertelement <4 x half> %3854, half %3833, i64 1
+ %3856 = insertelement <4 x half> %3855, half %3834, i64 2
+ %3857 = insertelement <4 x half> %3856, half %3835, i64 3
+ %3858 = bitcast <4 x half> %3857 to <2 x i32>
+ %3859 = shl i32 %3819, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3858, ptr addrspace(8) %3659, i32 %3859, i32 0, i32 0)
+ %3860 = add i32 %3633, %2301
+ %3861 = add i32 %3633, %2300
+ %3862 = add i32 %3634, %2301
+ %3863 = add i32 %3634, %2300
+ %3864 = fptrunc float %3292 to half
+ %3865 = fptrunc float %3293 to half
+ %3866 = fptrunc float %3294 to half
+ %3867 = fptrunc float %3295 to half
+ %3868 = fptrunc float %3298 to half
+ %3869 = fptrunc float %3299 to half
+ %3870 = fptrunc float %3300 to half
+ %3871 = fptrunc float %3301 to half
+ %3872 = fptrunc float %3304 to half
+ %3873 = fptrunc float %3305 to half
+ %3874 = fptrunc float %3306 to half
+ %3875 = fptrunc float %3307 to half
+ %3876 = fptrunc float %3310 to half
+ %3877 = fptrunc float %3311 to half
+ %3878 = fptrunc float %3312 to half
+ %3879 = fptrunc float %3313 to half
+ %3880 = insertelement <4 x half> poison, half %3864, i64 0
+ %3881 = insertelement <4 x half> %3880, half %3865, i64 1
+ %3882 = insertelement <4 x half> %3881, half %3866, i64 2
+ %3883 = insertelement <4 x half> %3882, half %3867, i64 3
+ %3884 = bitcast <4 x half> %3883 to <2 x i32>
+ %3885 = shl i32 %3860, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3884, ptr addrspace(8) %3659, i32 %3885, i32 0, i32 0)
+ %3886 = insertelement <4 x half> poison, half %3868, i64 0
+ %3887 = insertelement <4 x half> %3886, half %3869, i64 1
+ %3888 = insertelement <4 x half> %3887, half %3870, i64 2
+ %3889 = insertelement <4 x half> %3888, half %3871, i64 3
+ %3890 = bitcast <4 x half> %3889 to <2 x i32>
+ %3891 = shl i32 %3861, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3890, ptr addrspace(8) %3659, i32 %3891, i32 0, i32 0)
+ %3892 = insertelement <4 x half> poison, half %3872, i64 0
+ %3893 = insertelement <4 x half> %3892, half %3873, i64 1
+ %3894 = insertelement <4 x half> %3893, half %3874, i64 2
+ %3895 = insertelement <4 x half> %3894, half %3875, i64 3
+ %3896 = bitcast <4 x half> %3895 to <2 x i32>
+ %3897 = shl i32 %3862, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3896, ptr addrspace(8) %3659, i32 %3897, i32 0, i32 0)
+ %3898 = insertelement <4 x half> poison, half %3876, i64 0
+ %3899 = insertelement <4 x half> %3898, half %3877, i64 1
+ %3900 = insertelement <4 x half> %3899, half %3878, i64 2
+ %3901 = insertelement <4 x half> %3900, half %3879, i64 3
+ %3902 = bitcast <4 x half> %3901 to <2 x i32>
+ %3903 = shl i32 %3863, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3902, ptr addrspace(8) %3659, i32 %3903, i32 0, i32 0)
+ %3904 = add i32 %3633, %2299
+ %3905 = add i32 %3633, %2298
+ %3906 = add i32 %3634, %2299
+ %3907 = add i32 %3634, %2298
+ %3908 = fptrunc float %3376 to half
+ %3909 = fptrunc float %3377 to half
+ %3910 = fptrunc float %3378 to half
+ %3911 = fptrunc float %3379 to half
+ %3912 = fptrunc float %3382 to half
+ %3913 = fptrunc float %3383 to half
+ %3914 = fptrunc float %3384 to half
+ %3915 = fptrunc float %3385 to half
+ %3916 = fptrunc float %3388 to half
+ %3917 = fptrunc float %3389 to half
+ %3918 = fptrunc float %3390 to half
+ %3919 = fptrunc float %3391 to half
+ %3920 = fptrunc float %3394 to half
+ %3921 = fptrunc float %3395 to half
+ %3922 = fptrunc float %3396 to half
+ %3923 = fptrunc float %3397 to half
+ %3924 = insertelement <4 x half> poison, half %3908, i64 0
+ %3925 = insertelement <4 x half> %3924, half %3909, i64 1
+ %3926 = insertelement <4 x half> %3925, half %3910, i64 2
+ %3927 = insertelement <4 x half> %3926, half %3911, i64 3
+ %3928 = bitcast <4 x half> %3927 to <2 x i32>
+ %3929 = shl i32 %3904, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3928, ptr addrspace(8) %3659, i32 %3929, i32 0, i32 0)
+ %3930 = insertelement <4 x half> poison, half %3912, i64 0
+ %3931 = insertelement <4 x half> %3930, half %3913, i64 1
+ %3932 = insertelement <4 x half> %3931, half %3914, i64 2
+ %3933 = insertelement <4 x half> %3932, half %3915, i64 3
+ %3934 = bitcast <4 x half> %3933 to <2 x i32>
+ %3935 = shl i32 %3905, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3934, ptr addrspace(8) %3659, i32 %3935, i32 0, i32 0)
+ %3936 = insertelement <4 x half> poison, half %3916, i64 0
+ %3937 = insertelement <4 x half> %3936, half %3917, i64 1
+ %3938 = insertelement <4 x half> %3937, half %3918, i64 2
+ %3939 = insertelement <4 x half> %3938, half %3919, i64 3
+ %3940 = bitcast <4 x half> %3939 to <2 x i32>
+ %3941 = shl i32 %3906, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3940, ptr addrspace(8) %3659, i32 %3941, i32 0, i32 0)
+ %3942 = insertelement <4 x half> poison, half %3920, i64 0
+ %3943 = insertelement <4 x half> %3942, half %3921, i64 1
+ %3944 = insertelement <4 x half> %3943, half %3922, i64 2
+ %3945 = insertelement <4 x half> %3944, half %3923, i64 3
+ %3946 = bitcast <4 x half> %3945 to <2 x i32>
+ %3947 = shl i32 %3907, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3946, ptr addrspace(8) %3659, i32 %3947, i32 0, i32 0)
+ %3948 = add i32 %3633, %2297
+ %3949 = add i32 %3633, %2296
+ %3950 = add i32 %3634, %2297
+ %3951 = add i32 %3634, %2296
+ %3952 = fptrunc float %3400 to half
+ %3953 = fptrunc float %3401 to half
+ %3954 = fptrunc float %3402 to half
+ %3955 = fptrunc float %3403 to half
+ %3956 = fptrunc float %3406 to half
+ %3957 = fptrunc float %3407 to half
+ %3958 = fptrunc float %3408 to half
+ %3959 = fptrunc float %3409 to half
+ %3960 = fptrunc float %3412 to half
+ %3961 = fptrunc float %3413 to half
+ %3962 = fptrunc float %3414 to half
+ %3963 = fptrunc float %3415 to half
+ %3964 = fptrunc float %3418 to half
+ %3965 = fptrunc float %3419 to half
+ %3966 = fptrunc float %3420 to half
+ %3967 = fptrunc float %3421 to half
+ %3968 = insertelement <4 x half> poison, half %3952, i64 0
+ %3969 = insertelement <4 x half> %3968, half %3953, i64 1
+ %3970 = insertelement <4 x half> %3969, half %3954, i64 2
+ %3971 = insertelement <4 x half> %3970, half %3955, i64 3
+ %3972 = bitcast <4 x half> %3971 to <2 x i32>
+ %3973 = shl i32 %3948, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3972, ptr addrspace(8) %3659, i32 %3973, i32 0, i32 0)
+ %3974 = insertelement <4 x half> poison, half %3956, i64 0
+ %3975 = insertelement <4 x half> %3974, half %3957, i64 1
+ %3976 = insertelement <4 x half> %3975, half %3958, i64 2
+ %3977 = insertelement <4 x half> %3976, half %3959, i64 3
+ %3978 = bitcast <4 x half> %3977 to <2 x i32>
+ %3979 = shl i32 %3949, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3978, ptr addrspace(8) %3659, i32 %3979, i32 0, i32 0)
+ %3980 = insertelement <4 x half> poison, half %3960, i64 0
+ %3981 = insertelement <4 x half> %3980, half %3961, i64 1
+ %3982 = insertelement <4 x half> %3981, half %3962, i64 2
+ %3983 = insertelement <4 x half> %3982, half %3963, i64 3
+ %3984 = bitcast <4 x half> %3983 to <2 x i32>
+ %3985 = shl i32 %3950, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3984, ptr addrspace(8) %3659, i32 %3985, i32 0, i32 0)
+ %3986 = insertelement <4 x half> poison, half %3964, i64 0
+ %3987 = insertelement <4 x half> %3986, half %3965, i64 1
+ %3988 = insertelement <4 x half> %3987, half %3966, i64 2
+ %3989 = insertelement <4 x half> %3988, half %3967, i64 3
+ %3990 = bitcast <4 x half> %3989 to <2 x i32>
+ %3991 = shl i32 %3951, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3990, ptr addrspace(8) %3659, i32 %3991, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030)
+ %3992 = add i32 %3635, %2295
+ %3993 = add i32 %3635, %2302
+ %3994 = add i32 %3636, %2295
+ %3995 = add i32 %3636, %2302
+ %3996 = fptrunc float %3430 to half
+ %3997 = fptrunc float %3431 to half
+ %3998 = fptrunc float %3432 to half
+ %3999 = fptrunc float %3433 to half
+ %4000 = fptrunc float %3436 to half
+ %4001 = fptrunc float %3437 to half
+ %4002 = fptrunc float %3438 to half
+ %4003 = fptrunc float %3439 to half
+ %4004 = fptrunc float %3442 to half
+ %4005 = fptrunc float %3443 to half
+ %4006 = fptrunc float %3444 to half
+ %4007 = fptrunc float %3445 to half
+ %4008 = fptrunc float %3448 to half
+ %4009 = fptrunc float %3449 to half
+ %4010 = fptrunc float %3450 to half
+ %4011 = fptrunc float %3451 to half
+ %4012 = insertelement <4 x half> poison, half %3996, i64 0
+ %4013 = insertelement <4 x half> %4012, half %3997, i64 1
+ %4014 = insertelement <4 x half> %4013, half %3998, i64 2
+ %4015 = insertelement <4 x half> %4014, half %3999, i64 3
+ %4016 = bitcast <4 x half> %4015 to <2 x i32>
+ %4017 = shl i32 %3992, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4016, ptr addrspace(8) %3659, i32 %4017, i32 0, i32 0)
+ %4018 = insertelement <4 x half> poison, half %4000, i64 0
+ %4019 = insertelement <4 x half> %4018, half %4001, i64 1
+ %4020 = insertelement <4 x half> %4019, half %4002, i64 2
+ %4021 = insertelement <4 x half> %4020, half %4003, i64 3
+ %4022 = bitcast <4 x half> %4021 to <2 x i32>
+ %4023 = shl i32 %3993, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4022, ptr addrspace(8) %3659, i32 %4023, i32 0, i32 0)
+ %4024 = insertelement <4 x half> poison, half %4004, i64 0
+ %4025 = insertelement <4 x half> %4024, half %4005, i64 1
+ %4026 = insertelement <4 x half> %4025, half %4006, i64 2
+ %4027 = insertelement <4 x half> %4026, half %4007, i64 3
+ %4028 = bitcast <4 x half> %4027 to <2 x i32>
+ %4029 = shl i32 %3994, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4028, ptr addrspace(8) %3659, i32 %4029, i32 0, i32 0)
+ %4030 = insertelement <4 x half> poison, half %4008, i64 0
+ %4031 = insertelement <4 x half> %4030, half %4009, i64 1
+ %4032 = insertelement <4 x half> %4031, half %4010, i64 2
+ %4033 = insertelement <4 x half> %4032, half %4011, i64 3
+ %4034 = bitcast <4 x half> %4033 to <2 x i32>
+ %4035 = shl i32 %3995, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4034, ptr addrspace(8) %3659, i32 %4035, i32 0, i32 0)
+ %4036 = add i32 %3635, %2301
+ %4037 = add i32 %3635, %2300
+ %4038 = add i32 %3636, %2301
+ %4039 = add i32 %3636, %2300
+ %4040 = fptrunc float %3454 to half
+ %4041 = fptrunc float %3455 to half
+ %4042 = fptrunc float %3456 to half
+ %4043 = fptrunc float %3457 to half
+ %4044 = fptrunc float %3460 to half
+ %4045 = fptrunc float %3461 to half
+ %4046 = fptrunc float %3462 to half
+ %4047 = fptrunc float %3463 to half
+ %4048 = fptrunc float %3466 to half
+ %4049 = fptrunc float %3467 to half
+ %4050 = fptrunc float %3468 to half
+ %4051 = fptrunc float %3469 to half
+ %4052 = fptrunc float %3472 to half
+ %4053 = fptrunc float %3473 to half
+ %4054 = fptrunc float %3474 to half
+ %4055 = fptrunc float %3475 to half
+ %4056 = insertelement <4 x half> poison, half %4040, i64 0
+ %4057 = insertelement <4 x half> %4056, half %4041, i64 1
+ %4058 = insertelement <4 x half> %4057, half %4042, i64 2
+ %4059 = insertelement <4 x half> %4058, half %4043, i64 3
+ %4060 = bitcast <4 x half> %4059 to <2 x i32>
+ %4061 = shl i32 %4036, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4060, ptr addrspace(8) %3659, i32 %4061, i32 0, i32 0)
+ %4062 = insertelement <4 x half> poison, half %4044, i64 0
+ %4063 = insertelement <4 x half> %4062, half %4045, i64 1
+ %4064 = insertelement <4 x half> %4063, half %4046, i64 2
+ %4065 = insertelement <4 x half> %4064, half %4047, i64 3
+ %4066 = bitcast <4 x half> %4065 to <2 x i32>
+ %4067 = shl i32 %4037, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4066, ptr addrspace(8) %3659, i32 %4067, i32 0, i32 0)
+ %4068 = insertelement <4 x half> poison, half %4048, i64 0
+ %4069 = insertelement <4 x half> %4068, half %4049, i64 1
+ %4070 = insertelement <4 x half> %4069, half %4050, i64 2
+ %4071 = insertelement <4 x half> %4070, half %4051, i64 3
+ %4072 = bitcast <4 x half> %4071 to <2 x i32>
+ %4073 = shl i32 %4038, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4072, ptr addrspace(8) %3659, i32 %4073, i32 0, i32 0)
+ %4074 = insertelement <4 x half> poison, half %4052, i64 0
+ %4075 = insertelement <4 x half> %4074, half %4053, i64 1
+ %4076 = insertelement <4 x half> %4075, half %4054, i64 2
+ %4077 = insertelement <4 x half> %4076, half %4055, i64 3
+ %4078 = bitcast <4 x half> %4077 to <2 x i32>
+ %4079 = shl i32 %4039, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4078, ptr addrspace(8) %3659, i32 %4079, i32 0, i32 0)
+ %4080 = add i32 %3635, %2299
+ %4081 = add i32 %3635, %2298
+ %4082 = add i32 %3636, %2299
+ %4083 = add i32 %3636, %2298
+ %4084 = fptrunc float %3532 to half
+ %4085 = fptrunc float %3533 to half
+ %4086 = fptrunc float %3534 to half
+ %4087 = fptrunc float %3535 to half
+ %4088 = fptrunc float %3538 to half
+ %4089 = fptrunc float %3539 to half
+ %4090 = fptrunc float %3540 to half
+ %4091 = fptrunc float %3541 to half
+ %4092 = fptrunc float %3544 to half
+ %4093 = fptrunc float %3545 to half
+ %4094 = fptrunc float %3546 to half
+ %4095 = fptrunc float %3547 to half
+ %4096 = fptrunc float %3550 to half
+ %4097 = fptrunc float %3551 to half
+ %4098 = fptrunc float %3552 to half
+ %4099 = fptrunc float %3553 to half
+ %4100 = insertelement <4 x half> poison, half %4084, i64 0
+ %4101 = insertelement <4 x half> %4100, half %4085, i64 1
+ %4102 = insertelement <4 x half> %4101, half %4086, i64 2
+ %4103 = insertelement <4 x half> %4102, half %4087, i64 3
+ %4104 = bitcast <4 x half> %4103 to <2 x i32>
+ %4105 = shl i32 %4080, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4104, ptr addrspace(8) %3659, i32 %4105, i32 0, i32 0)
+ %4106 = insertelement <4 x half> poison, half %4088, i64 0
+ %4107 = insertelement <4 x half> %4106, half %4089, i64 1
+ %4108 = insertelement <4 x half> %4107, half %4090, i64 2
+ %4109 = insertelement <4 x half> %4108, half %4091, i64 3
+ %4110 = bitcast <4 x half> %4109 to <2 x i32>
+ %4111 = shl i32 %4081, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4110, ptr addrspace(8) %3659, i32 %4111, i32 0, i32 0)
+ %4112 = insertelement <4 x half> poison, half %4092, i64 0
+ %4113 = insertelement <4 x half> %4112, half %4093, i64 1
+ %4114 = insertelement <4 x half> %4113, half %4094, i64 2
+ %4115 = insertelement <4 x half> %4114, half %4095, i64 3
+ %4116 = bitcast <4 x half> %4115 to <2 x i32>
+ %4117 = shl i32 %4082, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4116, ptr addrspace(8) %3659, i32 %4117, i32 0, i32 0)
+ %4118 = insertelement <4 x half> poison, half %4096, i64 0
+ %4119 = insertelement <4 x half> %4118, half %4097, i64 1
+ %4120 = insertelement <4 x half> %4119, half %4098, i64 2
+ %4121 = insertelement <4 x half> %4120, half %4099, i64 3
+ %4122 = bitcast <4 x half> %4121 to <2 x i32>
+ %4123 = shl i32 %4083, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4122, ptr addrspace(8) %3659, i32 %4123, i32 0, i32 0)
+ %4124 = add i32 %3635, %2297
+ %4125 = add i32 %3635, %2296
+ %4126 = add i32 %3636, %2297
+ %4127 = add i32 %3636, %2296
+ %4128 = fptrunc float %3556 to half
+ %4129 = fptrunc float %3557 to half
+ %4130 = fptrunc float %3558 to half
+ %4131 = fptrunc float %3559 to half
+ %4132 = fptrunc float %3562 to half
+ %4133 = fptrunc float %3563 to half
+ %4134 = fptrunc float %3564 to half
+ %4135 = fptrunc float %3565 to half
+ %4136 = fptrunc float %3568 to half
+ %4137 = fptrunc float %3569 to half
+ %4138 = fptrunc float %3570 to half
+ %4139 = fptrunc float %3571 to half
+ %4140 = fptrunc float %3574 to half
+ %4141 = fptrunc float %3575 to half
+ %4142 = fptrunc float %3576 to half
+ %4143 = fptrunc float %3577 to half
+ %4144 = insertelement <4 x half> poison, half %4128, i64 0
+ %4145 = insertelement <4 x half> %4144, half %4129, i64 1
+ %4146 = insertelement <4 x half> %4145, half %4130, i64 2
+ %4147 = insertelement <4 x half> %4146, half %4131, i64 3
+ %4148 = bitcast <4 x half> %4147 to <2 x i32>
+ %4149 = shl i32 %4124, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4148, ptr addrspace(8) %3659, i32 %4149, i32 0, i32 0)
+ %4150 = insertelement <4 x half> poison, half %4132, i64 0
+ %4151 = insertelement <4 x half> %4150, half %4133, i64 1
+ %4152 = insertelement <4 x half> %4151, half %4134, i64 2
+ %4153 = insertelement <4 x half> %4152, half %4135, i64 3
+ %4154 = bitcast <4 x half> %4153 to <2 x i32>
+ %4155 = shl i32 %4125, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4154, ptr addrspace(8) %3659, i32 %4155, i32 0, i32 0)
+ %4156 = insertelement <4 x half> poison, half %4136, i64 0
+ %4157 = insertelement <4 x half> %4156, half %4137, i64 1
+ %4158 = insertelement <4 x half> %4157, half %4138, i64 2
+ %4159 = insertelement <4 x half> %4158, half %4139, i64 3
+ %4160 = bitcast <4 x half> %4159 to <2 x i32>
+ %4161 = shl i32 %4126, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4160, ptr addrspace(8) %3659, i32 %4161, i32 0, i32 0)
+ %4162 = insertelement <4 x half> poison, half %4140, i64 0
+ %4163 = insertelement <4 x half> %4162, half %4141, i64 1
+ %4164 = insertelement <4 x half> %4163, half %4142, i64 2
+ %4165 = insertelement <4 x half> %4164, half %4143, i64 3
+ %4166 = bitcast <4 x half> %4165 to <2 x i32>
+ %4167 = shl i32 %4127, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4166, ptr addrspace(8) %3659, i32 %4167, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 1030)
+ %4168 = add i32 %3637, %2295
+ %4169 = add i32 %3637, %2302
+ %4170 = add i32 %3638, %2295
+ %4171 = add i32 %3638, %2302
+ %4172 = fptrunc float %3484 to half
+ %4173 = fptrunc float %3485 to half
+ %4174 = fptrunc float %3486 to half
+ %4175 = fptrunc float %3487 to half
+ %4176 = fptrunc float %3490 to half
+ %4177 = fptrunc float %3491 to half
+ %4178 = fptrunc float %3492 to half
+ %4179 = fptrunc float %3493 to half
+ %4180 = fptrunc float %3496 to half
+ %4181 = fptrunc float %3497 to half
+ %4182 = fptrunc float %3498 to half
+ %4183 = fptrunc float %3499 to half
+ %4184 = fptrunc float %3502 to half
+ %4185 = fptrunc float %3503 to half
+ %4186 = fptrunc float %3504 to half
+ %4187 = fptrunc float %3505 to half
+ %4188 = insertelement <4 x half> poison, half %4172, i64 0
+ %4189 = insertelement <4 x half> %4188, half %4173, i64 1
+ %4190 = insertelement <4 x half> %4189, half %4174, i64 2
+ %4191 = insertelement <4 x half> %4190, half %4175, i64 3
+ %4192 = bitcast <4 x half> %4191 to <2 x i32>
+ %4193 = shl i32 %4168, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4192, ptr addrspace(8) %3659, i32 %4193, i32 0, i32 0)
+ %4194 = insertelement <4 x half> poison, half %4176, i64 0
+ %4195 = insertelement <4 x half> %4194, half %4177, i64 1
+ %4196 = insertelement <4 x half> %4195, half %4178, i64 2
+ %4197 = insertelement <4 x half> %4196, half %4179, i64 3
+ %4198 = bitcast <4 x half> %4197 to <2 x i32>
+ %4199 = shl i32 %4169, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4198, ptr addrspace(8) %3659, i32 %4199, i32 0, i32 0)
+ %4200 = insertelement <4 x half> poison, half %4180, i64 0
+ %4201 = insertelement <4 x half> %4200, half %4181, i64 1
+ %4202 = insertelement <4 x half> %4201, half %4182, i64 2
+ %4203 = insertelement <4 x half> %4202, half %4183, i64 3
+ %4204 = bitcast <4 x half> %4203 to <2 x i32>
+ %4205 = shl i32 %4170, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4204, ptr addrspace(8) %3659, i32 %4205, i32 0, i32 0)
+ %4206 = insertelement <4 x half> poison, half %4184, i64 0
+ %4207 = insertelement <4 x half> %4206, half %4185, i64 1
+ %4208 = insertelement <4 x half> %4207, half %4186, i64 2
+ %4209 = insertelement <4 x half> %4208, half %4187, i64 3
+ %4210 = bitcast <4 x half> %4209 to <2 x i32>
+ %4211 = shl i32 %4171, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4210, ptr addrspace(8) %3659, i32 %4211, i32 0, i32 0)
+ %4212 = add i32 %3637, %2301
+ %4213 = add i32 %3637, %2300
+ %4214 = add i32 %3638, %2301
+ %4215 = add i32 %3638, %2300
+ %4216 = fptrunc float %3508 to half
+ %4217 = fptrunc float %3509 to half
+ %4218 = fptrunc float %3510 to half
+ %4219 = fptrunc float %3511 to half
+ %4220 = fptrunc float %3514 to half
+ %4221 = fptrunc float %3515 to half
+ %4222 = fptrunc float %3516 to half
+ %4223 = fptrunc float %3517 to half
+ %4224 = fptrunc float %3520 to half
+ %4225 = fptrunc float %3521 to half
+ %4226 = fptrunc float %3522 to half
+ %4227 = fptrunc float %3523 to half
+ %4228 = fptrunc float %3526 to half
+ %4229 = fptrunc float %3527 to half
+ %4230 = fptrunc float %3528 to half
+ %4231 = fptrunc float %3529 to half
+ %4232 = insertelement <4 x half> poison, half %4216, i64 0
+ %4233 = insertelement <4 x half> %4232, half %4217, i64 1
+ %4234 = insertelement <4 x half> %4233, half %4218, i64 2
+ %4235 = insertelement <4 x half> %4234, half %4219, i64 3
+ %4236 = bitcast <4 x half> %4235 to <2 x i32>
+ %4237 = shl i32 %4212, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4236, ptr addrspace(8) %3659, i32 %4237, i32 0, i32 0)
+ %4238 = insertelement <4 x half> poison, half %4220, i64 0
+ %4239 = insertelement <4 x half> %4238, half %4221, i64 1
+ %4240 = insertelement <4 x half> %4239, half %4222, i64 2
+ %4241 = insertelement <4 x half> %4240, half %4223, i64 3
+ %4242 = bitcast <4 x half> %4241 to <2 x i32>
+ %4243 = shl i32 %4213, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4242, ptr addrspace(8) %3659, i32 %4243, i32 0, i32 0)
+ %4244 = insertelement <4 x half> poison, half %4224, i64 0
+ %4245 = insertelement <4 x half> %4244, half %4225, i64 1
+ %4246 = insertelement <4 x half> %4245, half %4226, i64 2
+ %4247 = insertelement <4 x half> %4246, half %4227, i64 3
+ %4248 = bitcast <4 x half> %4247 to <2 x i32>
+ %4249 = shl i32 %4214, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4248, ptr addrspace(8) %3659, i32 %4249, i32 0, i32 0)
+ %4250 = insertelement <4 x half> poison, half %4228, i64 0
+ %4251 = insertelement <4 x half> %4250, half %4229, i64 1
+ %4252 = insertelement <4 x half> %4251, half %4230, i64 2
+ %4253 = insertelement <4 x half> %4252, half %4231, i64 3
+ %4254 = bitcast <4 x half> %4253 to <2 x i32>
+ %4255 = shl i32 %4215, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4254, ptr addrspace(8) %3659, i32 %4255, i32 0, i32 0)
+ %4256 = add i32 %3637, %2299
+ %4257 = add i32 %3637, %2298
+ %4258 = add i32 %3638, %2299
+ %4259 = add i32 %3638, %2298
+ %4260 = fptrunc float %3580 to half
+ %4261 = fptrunc float %3581 to half
+ %4262 = fptrunc float %3582 to half
+ %4263 = fptrunc float %3583 to half
+ %4264 = fptrunc float %3586 to half
+ %4265 = fptrunc float %3587 to half
+ %4266 = fptrunc float %3588 to half
+ %4267 = fptrunc float %3589 to half
+ %4268 = fptrunc float %3592 to half
+ %4269 = fptrunc float %3593 to half
+ %4270 = fptrunc float %3594 to half
+ %4271 = fptrunc float %3595 to half
+ %4272 = fptrunc float %3598 to half
+ %4273 = fptrunc float %3599 to half
+ %4274 = fptrunc float %3600 to half
+ %4275 = fptrunc float %3601 to half
+ %4276 = insertelement <4 x half> poison, half %4260, i64 0
+ %4277 = insertelement <4 x half> %4276, half %4261, i64 1
+ %4278 = insertelement <4 x half> %4277, half %4262, i64 2
+ %4279 = insertelement <4 x half> %4278, half %4263, i64 3
+ %4280 = bitcast <4 x half> %4279 to <2 x i32>
+ %4281 = shl i32 %4256, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4280, ptr addrspace(8) %3659, i32 %4281, i32 0, i32 0)
+ %4282 = insertelement <4 x half> poison, half %4264, i64 0
+ %4283 = insertelement <4 x half> %4282, half %4265, i64 1
+ %4284 = insertelement <4 x half> %4283, half %4266, i64 2
+ %4285 = insertelement <4 x half> %4284, half %4267, i64 3
+ %4286 = bitcast <4 x half> %4285 to <2 x i32>
+ %4287 = shl i32 %4257, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4286, ptr addrspace(8) %3659, i32 %4287, i32 0, i32 0)
+ %4288 = insertelement <4 x half> poison, half %4268, i64 0
+ %4289 = insertelement <4 x half> %4288, half %4269, i64 1
+ %4290 = insertelement <4 x half> %4289, half %4270, i64 2
+ %4291 = insertelement <4 x half> %4290, half %4271, i64 3
+ %4292 = bitcast <4 x half> %4291 to <2 x i32>
+ %4293 = shl i32 %4258, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4292, ptr addrspace(8) %3659, i32 %4293, i32 0, i32 0)
+ %4294 = insertelement <4 x half> poison, half %4272, i64 0
+ %4295 = insertelement <4 x half> %4294, half %4273, i64 1
+ %4296 = insertelement <4 x half> %4295, half %4274, i64 2
+ %4297 = insertelement <4 x half> %4296, half %4275, i64 3
+ %4298 = bitcast <4 x half> %4297 to <2 x i32>
+ %4299 = shl i32 %4259, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4298, ptr addrspace(8) %3659, i32 %4299, i32 0, i32 0)
+ %4300 = add i32 %3637, %2297
+ %4301 = add i32 %3637, %2296
+ %4302 = add i32 %3638, %2297
+ %4303 = add i32 %3638, %2296
+ %4304 = fptrunc float %3604 to half
+ %4305 = fptrunc float %3605 to half
+ %4306 = fptrunc float %3606 to half
+ %4307 = fptrunc float %3607 to half
+ %4308 = fptrunc float %3610 to half
+ %4309 = fptrunc float %3611 to half
+ %4310 = fptrunc float %3612 to half
+ %4311 = fptrunc float %3613 to half
+ %4312 = fptrunc float %3616 to half
+ %4313 = fptrunc float %3617 to half
+ %4314 = fptrunc float %3618 to half
+ %4315 = fptrunc float %3619 to half
+ %4316 = fptrunc float %3622 to half
+ %4317 = fptrunc float %3623 to half
+ %4318 = fptrunc float %3624 to half
+ %4319 = fptrunc float %3625 to half
+ %4320 = insertelement <4 x half> poison, half %4304, i64 0
+ %4321 = insertelement <4 x half> %4320, half %4305, i64 1
+ %4322 = insertelement <4 x half> %4321, half %4306, i64 2
+ %4323 = insertelement <4 x half> %4322, half %4307, i64 3
+ %4324 = bitcast <4 x half> %4323 to <2 x i32>
+ %4325 = shl i32 %4300, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4324, ptr addrspace(8) %3659, i32 %4325, i32 0, i32 0)
+ %4326 = insertelement <4 x half> poison, half %4308, i64 0
+ %4327 = insertelement <4 x half> %4326, half %4309, i64 1
+ %4328 = insertelement <4 x half> %4327, half %4310, i64 2
+ %4329 = insertelement <4 x half> %4328, half %4311, i64 3
+ %4330 = bitcast <4 x half> %4329 to <2 x i32>
+ %4331 = shl i32 %4301, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4330, ptr addrspace(8) %3659, i32 %4331, i32 0, i32 0)
+ %4332 = insertelement <4 x half> poison, half %4312, i64 0
+ %4333 = insertelement <4 x half> %4332, half %4313, i64 1
+ %4334 = insertelement <4 x half> %4333, half %4314, i64 2
+ %4335 = insertelement <4 x half> %4334, half %4315, i64 3
+ %4336 = bitcast <4 x half> %4335 to <2 x i32>
+ %4337 = shl i32 %4302, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4336, ptr addrspace(8) %3659, i32 %4337, i32 0, i32 0)
+ %4338 = insertelement <4 x half> poison, half %4316, i64 0
+ %4339 = insertelement <4 x half> %4338, half %4317, i64 1
+ %4340 = insertelement <4 x half> %4339, half %4318, i64 2
+ %4341 = insertelement <4 x half> %4340, half %4319, i64 3
+ %4342 = bitcast <4 x half> %4341 to <2 x i32>
+ %4343 = shl i32 %4303, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4342, ptr addrspace(8) %3659, i32 %4343, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smin.i32(i32, i32) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #3
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.s.barrier() #4
+
+; Function Attrs: convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) #5
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.sched.barrier(i32 immarg) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #6
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #4
+
+attributes #0 = { nofree norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #4 = { convergent mustprogress nocallback nofree nounwind willreturn }
+attributes #5 = { convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!3 = !DIFile(filename: "<unknown>", directory: "")
+!4 = distinct !DISubprogram(name: "matmul_kernel", linkageName: "matmul_kernel", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 222, column: 7, scope: !4)
+!8 = !DILocation(line: 224, column: 7, scope: !4)
+!9 = !DILocation(line: 225, column: 7, scope: !4)
+!10 = !DILocation(line: 230, column: 7, scope: !4)
+!11 = !DILocation(line: 231, column: 7, scope: !4)
+!12 = !DILocation(line: 232, column: 7, scope: !4)
+!13 = !DILocation(line: 233, column: 7, scope: !4)
+!14 = !DILocation(line: 234, column: 7, scope: !4)
+!15 = !DILocation(line: 235, column: 7, scope: !4)
+!16 = !DILocation(line: 236, column: 7, scope: !4)
+!17 = !DILocation(line: 237, column: 7, scope: !4)
+!18 = !DILocation(line: 238, column: 7, scope: !4)
+!19 = !DILocation(line: 241, column: 7, scope: !4)
+!20 = !DILocation(line: 242, column: 7, scope: !4)
+!21 = !DILocation(line: 247, column: 7, scope: !4)
+!22 = !DILocation(line: 249, column: 7, scope: !4)
+!23 = !DILocation(line: 250, column: 7, scope: !4)
+!24 = !DILocation(line: 255, column: 7, scope: !4)
+!25 = !DILocation(line: 256, column: 7, scope: !4)
+!26 = !DILocation(line: 257, column: 7, scope: !4)
+!27 = !DILocation(line: 258, column: 7, scope: !4)
+!28 = !DILocation(line: 259, column: 7, scope: !4)
+!29 = !DILocation(line: 260, column: 7, scope: !4)
+!30 = !DILocation(line: 261, column: 7, scope: !4)
+!31 = !DILocation(line: 262, column: 7, scope: !4)
+!32 = !DILocation(line: 263, column: 7, scope: !4)
+!33 = !DILocation(line: 266, column: 7, scope: !4)
+!34 = !DILocation(line: 267, column: 7, scope: !4)
+!35 = !DILocation(line: 275, column: 7, scope: !4)
+!36 = !DILocation(line: 280, column: 7, scope: !4)
+!37 = !DILocation(line: 285, column: 7, scope: !4)
+!38 = !DILocation(line: 286, column: 7, scope: !4)
+!39 = !DILocation(line: 287, column: 7, scope: !4)
+!40 = !DILocation(line: 288, column: 7, scope: !4)
+!41 = !DILocation(line: 289, column: 7, scope: !4)
+!42 = !DILocation(line: 290, column: 7, scope: !4)
+!43 = !DILocation(line: 291, column: 7, scope: !4)
+!44 = !DILocation(line: 292, column: 7, scope: !4)
+!45 = !DILocation(line: 293, column: 7, scope: !4)
+!46 = !DILocation(line: 298, column: 7, scope: !4)
+!47 = !DILocation(line: 299, column: 7, scope: !4)
+!48 = !DILocation(line: 300, column: 7, scope: !4)
+!49 = !DILocation(line: 301, column: 7, scope: !4)
+!50 = !DILocation(line: 302, column: 7, scope: !4)
+!51 = !DILocation(line: 303, column: 7, scope: !4)
+!52 = !DILocation(line: 304, column: 7, scope: !4)
+!53 = !DILocation(line: 305, column: 7, scope: !4)
+!54 = !DILocation(line: 306, column: 7, scope: !4)
+!55 = !DILocation(line: 309, column: 7, scope: !4)
+!56 = !DILocation(line: 310, column: 7, scope: !4)
+!57 = !DILocation(line: 315, column: 7, scope: !4)
+!58 = !DILocation(line: 316, column: 7, scope: !4)
+!59 = !DILocation(line: 321, column: 7, scope: !4)
+!60 = !DILocation(line: 322, column: 7, scope: !4)
+!61 = !DILocation(line: 323, column: 7, scope: !4)
+!62 = !DILocation(line: 324, column: 7, scope: !4)
+!63 = !DILocation(line: 325, column: 7, scope: !4)
+!64 = !DILocation(line: 326, column: 7, scope: !4)
+!65 = !DILocation(line: 327, column: 7, scope: !4)
+!66 = !DILocation(line: 328, column: 7, scope: !4)
+!67 = !DILocation(line: 329, column: 7, scope: !4)
+!68 = !DILocation(line: 334, column: 7, scope: !4)
+!69 = !DILocation(line: 335, column: 7, scope: !4)
+!70 = !DILocation(line: 336, column: 7, scope: !4)
+!71 = !DILocation(line: 337, column: 7, scope: !4)
+!72 = !DILocation(line: 338, column: 7, scope: !4)
+!73 = !DILocation(line: 339, column: 7, scope: !4)
+!74 = !DILocation(line: 340, column: 7, scope: !4)
+!75 = !DILocation(line: 341, column: 7, scope: !4)
+!76 = !DILocation(line: 342, column: 7, scope: !4)
+!77 = !DILocation(line: 347, column: 7, scope: !4)
+!78 = !DILocation(line: 348, column: 7, scope: !4)
+!79 = !DILocation(line: 364, column: 7, scope: !4)
+!80 = !DILocation(line: 365, column: 7, scope: !4)
+!81 = !DILocation(line: 366, column: 7, scope: !4)
+!82 = !DILocation(line: 367, column: 7, scope: !4)
+!83 = !DILocation(line: 368, column: 7, scope: !4)
+!84 = !DILocation(line: 369, column: 7, scope: !4)
+!85 = !DILocation(line: 370, column: 7, scope: !4)
+!86 = !DILocation(line: 371, column: 7, scope: !4)
+!87 = !DILocation(line: 372, column: 7, scope: !4)
+!88 = !DILocation(line: 373, column: 7, scope: !4)
+!89 = !DILocation(line: 374, column: 7, scope: !4)
+!90 = !DILocation(line: 375, column: 7, scope: !4)
+!91 = !DILocation(line: 376, column: 7, scope: !4)
+!92 = !DILocation(line: 377, column: 7, scope: !4)
+!93 = !DILocation(line: 378, column: 7, scope: !4)
+!94 = !DILocation(line: 379, column: 7, scope: !4)
+!95 = !DILocation(line: 380, column: 7, scope: !4)
+!96 = !DILocation(line: 381, column: 7, scope: !4)
+!97 = !DILocation(line: 382, column: 7, scope: !4)
+!98 = !DILocation(line: 383, column: 7, scope: !4)
+!99 = !DILocation(line: 384, column: 7, scope: !4)
+!100 = !DILocation(line: 385, column: 7, scope: !4)
+!101 = !DILocation(line: 386, column: 7, scope: !4)
+!102 = !DILocation(line: 387, column: 7, scope: !4)
+!103 = !DILocation(line: 388, column: 7, scope: !4)
+!104 = !DILocation(line: 389, column: 7, scope: !4)
+!105 = !DILocation(line: 390, column: 7, scope: !4)
+!106 = !DILocation(line: 391, column: 7, scope: !4)
+!107 = !DILocation(line: 392, column: 7, scope: !4)
+!108 = !DILocation(line: 393, column: 7, scope: !4)
+!109 = !DILocation(line: 394, column: 7, scope: !4)
+!110 = !DILocation(line: 395, column: 7, scope: !4)
+!111 = !DILocation(line: 396, column: 7, scope: !4)
+!112 = !DILocation(line: 412, column: 7, scope: !4)
+!113 = !DILocation(line: 413, column: 7, scope: !4)
+!114 = !DILocation(line: 414, column: 7, scope: !4)
+!115 = !DILocation(line: 415, column: 7, scope: !4)
+!116 = !DILocation(line: 416, column: 7, scope: !4)
+!117 = !DILocation(line: 417, column: 7, scope: !4)
+!118 = !DILocation(line: 418, column: 7, scope: !4)
+!119 = !DILocation(line: 419, column: 7, scope: !4)
+!120 = !DILocation(line: 420, column: 7, scope: !4)
+!121 = !DILocation(line: 421, column: 7, scope: !4)
+!122 = !DILocation(line: 422, column: 7, scope: !4)
+!123 = !DILocation(line: 423, column: 7, scope: !4)
+!124 = !DILocation(line: 424, column: 7, scope: !4)
+!125 = !DILocation(line: 425, column: 7, scope: !4)
+!126 = !DILocation(line: 426, column: 7, scope: !4)
+!127 = !DILocation(line: 427, column: 7, scope: !4)
+!128 = !DILocation(line: 428, column: 7, scope: !4)
+!129 = !DILocation(line: 429, column: 7, scope: !4)
+!130 = !DILocation(line: 430, column: 7, scope: !4)
+!131 = !DILocation(line: 431, column: 7, scope: !4)
+!132 = !DILocation(line: 432, column: 7, scope: !4)
+!133 = !DILocation(line: 433, column: 7, scope: !4)
+!134 = !DILocation(line: 434, column: 7, scope: !4)
+!135 = !DILocation(line: 435, column: 7, scope: !4)
+!136 = !DILocation(line: 436, column: 7, scope: !4)
+!137 = !DILocation(line: 437, column: 7, scope: !4)
+!138 = !DILocation(line: 438, column: 7, scope: !4)
+!139 = !DILocation(line: 439, column: 7, scope: !4)
+!140 = !DILocation(line: 440, column: 7, scope: !4)
+!141 = !DILocation(line: 441, column: 7, scope: !4)
+!142 = !DILocation(line: 442, column: 7, scope: !4)
+!143 = !DILocation(line: 443, column: 7, scope: !4)
+!144 = !DILocation(line: 444, column: 7, scope: !4)
+!145 = !DILocation(line: 449, column: 7, scope: !4)
+!146 = !DILocation(line: 450, column: 7, scope: !4)
+!147 = !DILocation(line: 456, column: 7, scope: !4)
+!148 = !DILocation(line: 457, column: 7, scope: !4)
+!149 = !DILocation(line: 458, column: 7, scope: !4)
+!150 = !DILocation(line: 459, column: 7, scope: !4)
+!151 = !DILocation(line: 460, column: 7, scope: !4)
+!152 = !DILocation(line: 461, column: 7, scope: !4)
+!153 = !DILocation(line: 462, column: 7, scope: !4)
+!154 = !DILocation(line: 463, column: 7, scope: !4)
+!155 = !DILocation(line: 464, column: 7, scope: !4)
+!156 = !DILocation(line: 469, column: 7, scope: !4)
+!157 = !DILocation(line: 470, column: 7, scope: !4)
+!158 = !DILocation(line: 471, column: 7, scope: !4)
+!159 = !DILocation(line: 472, column: 7, scope: !4)
+!160 = !DILocation(line: 473, column: 7, scope: !4)
+!161 = !DILocation(line: 474, column: 7, scope: !4)
+!162 = !DILocation(line: 475, column: 7, scope: !4)
+!163 = !DILocation(line: 476, column: 7, scope: !4)
+!164 = !DILocation(line: 477, column: 7, scope: !4)
+!165 = !DILocation(line: 480, column: 7, scope: !4)
+!166 = !DILocation(line: 481, column: 7, scope: !4)
+!167 = !DILocation(line: 166, column: 9, scope: !4)
+!168 = !DILocation(line: 174, column: 9, scope: !4)
+!169 = !DILocation(line: 175, column: 9, scope: !4)
+!170 = !DILocation(line: 176, column: 9, scope: !4)
+!171 = !DILocation(line: 177, column: 9, scope: !4)
+!172 = !DILocation(line: 178, column: 9, scope: !4)
+!173 = !DILocation(line: 179, column: 9, scope: !4)
+!174 = !DILocation(line: 180, column: 9, scope: !4)
+!175 = !DILocation(line: 181, column: 9, scope: !4)
+!176 = !DILocation(line: 182, column: 9, scope: !4)
+!177 = !DILocation(line: 183, column: 9, scope: !4)
+!178 = !DILocation(line: 184, column: 9, scope: !4)
+!179 = !DILocation(line: 185, column: 9, scope: !4)
+!180 = !DILocation(line: 186, column: 9, scope: !4)
+!181 = !DILocation(line: 187, column: 9, scope: !4)
+!182 = !DILocation(line: 188, column: 9, scope: !4)
diff --git a/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir b/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir
new file mode 100644
index 0000000000000..22f52e751006c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/4_tlp_fast_no_barriers.llir
@@ -0,0 +1,4774 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: nofree norecurse nounwind
+define amdgpu_kernel void @matmul_kernel(ptr addrspace(1) inreg nocapture readonly %0, ptr addrspace(1) inreg nocapture readonly %1, ptr addrspace(1) inreg nocapture writeonly %2, ptr addrspace(1) inreg nocapture readnone %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, ptr addrspace(1) inreg nocapture readnone %11) local_unnamed_addr #0 !dbg !4 {
+ %13 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %14 = sdiv i32 %13, 8
+ %15 = mul i32 %13, 76
+ %16 = mul i32 %14, -607
+ %17 = add i32 %16, %15
+ %18 = add i32 %5, 255
+ %19 = sdiv i32 %18, 256
+ %20 = shl nsw i32 %19, 2
+ %.frozen = freeze i32 %20
+ %21 = sdiv i32 %17, %.frozen
+ %22 = shl nsw i32 %21, 2
+ %23 = mul i32 %21, %.frozen
+ %.decomposed = sub i32 %17, %23
+ %24 = add i32 %4, 255
+ %25 = sdiv i32 %24, 256
+ %26 = sub nsw i32 %25, %22
+ %27 = tail call i32 @llvm.smin.i32(i32 %26, i32 4)
+ %.decomposed.frozen = freeze i32 %.decomposed
+ %.frozen2426 = freeze i32 %27
+ %28 = sdiv i32 %.decomposed.frozen, %.frozen2426
+ %29 = mul i32 %28, %.frozen2426
+ %.decomposed2427 = sub i32 %.decomposed.frozen, %29
+ %30 = add nsw i32 %.decomposed2427, %22
+ %31 = shl i32 %30, 8
+ %32 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %33 = lshr i32 %32, 3
+ %34 = and i32 %33, 16
+ %35 = and i32 %33, 31
+ %36 = or disjoint i32 %35, 32
+ %37 = or disjoint i32 %35, 64
+ %38 = or disjoint i32 %35, 96
+ %39 = or disjoint i32 %35, 128
+ %40 = or disjoint i32 %35, 160
+ %41 = or disjoint i32 %35, 192
+ %42 = or disjoint i32 %35, 224
+ %43 = mul i32 %31, %7
+ %44 = mul i32 %7, %35
+ %45 = mul i32 %7, %36
+ %46 = mul i32 %7, %37
+ %47 = mul i32 %7, %38
+ %48 = mul i32 %7, %39
+ %49 = mul i32 %7, %40
+ %50 = mul i32 %7, %41
+ %51 = mul i32 %7, %42
+ %52 = sext i32 %43 to i64
+ %53 = getelementptr half, ptr addrspace(1) %0, i64 %52
+ %54 = shl i32 %32, 3
+ %55 = and i32 %54, 56
+ %56 = add i32 %44, %55
+ %57 = add i32 %45, %55
+ %58 = add i32 %46, %55
+ %59 = add i32 %47, %55
+ %60 = add i32 %48, %55
+ %61 = add i32 %49, %55
+ %62 = add i32 %50, %55
+ %63 = add i32 %51, %55
+ %64 = getelementptr i8, ptr addrspace(1) %53, i64 128
+ %65 = add i32 %6, 63
+ %66 = icmp sgt i32 %65, 63
+ %67 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %53, i16 0, i32 2147483646, i32 159744)
+ %68 = shl i32 %56, 1
+ %69 = select i1 %66, i32 %68, i32 -2147483648
+ %70 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %69, i32 0, i32 0)
+ %71 = shl i32 %57, 1
+ %72 = select i1 %66, i32 %71, i32 -2147483648
+ %73 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %72, i32 0, i32 0)
+ %74 = shl i32 %58, 1
+ %75 = select i1 %66, i32 %74, i32 -2147483648
+ %76 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %75, i32 0, i32 0)
+ %77 = shl i32 %59, 1
+ %78 = select i1 %66, i32 %77, i32 -2147483648
+ %79 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %78, i32 0, i32 0)
+ %80 = shl i32 %60, 1
+ %81 = select i1 %66, i32 %80, i32 -2147483648
+ %82 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %81, i32 0, i32 0)
+ %83 = shl i32 %61, 1
+ %84 = select i1 %66, i32 %83, i32 -2147483648
+ %85 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %84, i32 0, i32 0)
+ %86 = shl i32 %62, 1
+ %87 = select i1 %66, i32 %86, i32 -2147483648
+ %88 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %87, i32 0, i32 0)
+ %89 = shl i32 %63, 1
+ %90 = select i1 %66, i32 %89, i32 -2147483648
+ %91 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %67, i32 %90, i32 0, i32 0)
+ %92 = icmp sgt i32 %65, 127
+ %93 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %64, i16 0, i32 2147483646, i32 159744)
+ %94 = select i1 %92, i32 %68, i32 -2147483648
+ %95 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %94, i32 0, i32 0)
+ %96 = bitcast <4 x i32> %95 to <8 x half>
+ %97 = select i1 %92, i32 %71, i32 -2147483648
+ %98 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %97, i32 0, i32 0)
+ %99 = bitcast <4 x i32> %98 to <8 x half>
+ %100 = select i1 %92, i32 %74, i32 -2147483648
+ %101 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %100, i32 0, i32 0)
+ %102 = bitcast <4 x i32> %101 to <8 x half>
+ %103 = select i1 %92, i32 %77, i32 -2147483648
+ %104 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %103, i32 0, i32 0)
+ %105 = bitcast <4 x i32> %104 to <8 x half>
+ %106 = select i1 %92, i32 %80, i32 -2147483648
+ %107 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %106, i32 0, i32 0)
+ %108 = bitcast <4 x i32> %107 to <8 x half>
+ %109 = select i1 %92, i32 %83, i32 -2147483648
+ %110 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %109, i32 0, i32 0)
+ %111 = bitcast <4 x i32> %110 to <8 x half>
+ %112 = select i1 %92, i32 %86, i32 -2147483648
+ %113 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %112, i32 0, i32 0)
+ %114 = bitcast <4 x i32> %113 to <8 x half>
+ %115 = select i1 %92, i32 %89, i32 -2147483648
+ %116 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %93, i32 %115, i32 0, i32 0)
+ %117 = bitcast <4 x i32> %116 to <8 x half>
+ %118 = shl i32 %28, 8
+ %119 = mul i32 %118, %8
+ %120 = mul i32 %8, %35
+ %121 = mul i32 %8, %36
+ %122 = mul i32 %8, %37
+ %123 = mul i32 %8, %38
+ %124 = mul i32 %8, %39
+ %125 = mul i32 %8, %40
+ %126 = mul i32 %8, %41
+ %127 = mul i32 %8, %42
+ %128 = sext i32 %119 to i64
+ %129 = getelementptr half, ptr addrspace(1) %1, i64 %128
+ %130 = add i32 %120, %55
+ %131 = add i32 %121, %55
+ %132 = add i32 %122, %55
+ %133 = add i32 %123, %55
+ %134 = add i32 %124, %55
+ %135 = add i32 %125, %55
+ %136 = add i32 %126, %55
+ %137 = add i32 %127, %55
+ %138 = getelementptr i8, ptr addrspace(1) %129, i64 128
+ %139 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %129, i16 0, i32 2147483646, i32 159744)
+ %140 = shl i32 %130, 1
+ %141 = select i1 %66, i32 %140, i32 -2147483648
+ %142 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %141, i32 0, i32 0)
+ %143 = shl i32 %131, 1
+ %144 = select i1 %66, i32 %143, i32 -2147483648
+ %145 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %144, i32 0, i32 0)
+ %146 = shl i32 %132, 1
+ %147 = select i1 %66, i32 %146, i32 -2147483648
+ %148 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %147, i32 0, i32 0)
+ %149 = shl i32 %133, 1
+ %150 = select i1 %66, i32 %149, i32 -2147483648
+ %151 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %150, i32 0, i32 0)
+ %152 = shl i32 %134, 1
+ %153 = select i1 %66, i32 %152, i32 -2147483648
+ %154 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %153, i32 0, i32 0)
+ %155 = shl i32 %135, 1
+ %156 = select i1 %66, i32 %155, i32 -2147483648
+ %157 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %156, i32 0, i32 0)
+ %158 = shl i32 %136, 1
+ %159 = select i1 %66, i32 %158, i32 -2147483648
+ %160 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %159, i32 0, i32 0)
+ %161 = shl i32 %137, 1
+ %162 = select i1 %66, i32 %161, i32 -2147483648
+ %163 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %139, i32 %162, i32 0, i32 0)
+ %164 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %138, i16 0, i32 2147483646, i32 159744)
+ %165 = select i1 %92, i32 %140, i32 -2147483648
+ %166 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %165, i32 0, i32 0)
+ %167 = bitcast <4 x i32> %166 to <8 x half>
+ %168 = select i1 %92, i32 %143, i32 -2147483648
+ %169 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %168, i32 0, i32 0)
+ %170 = bitcast <4 x i32> %169 to <8 x half>
+ %171 = select i1 %92, i32 %146, i32 -2147483648
+ %172 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %171, i32 0, i32 0)
+ %173 = bitcast <4 x i32> %172 to <8 x half>
+ %174 = select i1 %92, i32 %149, i32 -2147483648
+ %175 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %174, i32 0, i32 0)
+ %176 = bitcast <4 x i32> %175 to <8 x half>
+ %177 = select i1 %92, i32 %152, i32 -2147483648
+ %178 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %177, i32 0, i32 0)
+ %179 = bitcast <4 x i32> %178 to <8 x half>
+ %180 = select i1 %92, i32 %155, i32 -2147483648
+ %181 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %180, i32 0, i32 0)
+ %182 = bitcast <4 x i32> %181 to <8 x half>
+ %183 = select i1 %92, i32 %158, i32 -2147483648
+ %184 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %183, i32 0, i32 0)
+ %185 = bitcast <4 x i32> %184 to <8 x half>
+ %186 = select i1 %92, i32 %161, i32 -2147483648
+ %187 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %164, i32 %186, i32 0, i32 0)
+ %188 = bitcast <4 x i32> %187 to <8 x half>
+ %189 = icmp sgt i32 %7, 0
+ tail call void @llvm.assume(i1 %189)
+ %190 = icmp sgt i32 %8, 0
+ tail call void @llvm.assume(i1 %190)
+ %191 = icmp sgt i32 %9, 0
+ tail call void @llvm.assume(i1 %191)
+ %192 = icmp sgt i32 %10, 0
+ tail call void @llvm.assume(i1 %192)
+ %193 = icmp sgt i32 %30, 0
+ tail call void @llvm.assume(i1 %193)
+ %194 = icmp sgt i32 %28, 0
+ tail call void @llvm.assume(i1 %194)
+ %195 = xor i32 %54, %32
+ %196 = and i32 %195, 56
+ %197 = shl nuw nsw i32 %35, 6
+ %198 = or disjoint i32 %197, %196
+ %199 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %198
+ store <4 x i32> %70, ptr addrspace(3) %199, align 16
+ %200 = or disjoint i32 %198, 2048
+ %201 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %200
+ store <4 x i32> %73, ptr addrspace(3) %201, align 16
+ %202 = or disjoint i32 %198, 4096
+ %203 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %202
+ store <4 x i32> %76, ptr addrspace(3) %203, align 16
+ %204 = or disjoint i32 %198, 6144
+ %205 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %204
+ store <4 x i32> %79, ptr addrspace(3) %205, align 16
+ %206 = or disjoint i32 %198, 8192
+ %207 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %206
+ store <4 x i32> %82, ptr addrspace(3) %207, align 16
+ %208 = or disjoint i32 %198, 10240
+ %209 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %208
+ store <4 x i32> %85, ptr addrspace(3) %209, align 16
+ %210 = or disjoint i32 %198, 12288
+ %211 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %210
+ store <4 x i32> %88, ptr addrspace(3) %211, align 16
+ %212 = or disjoint i32 %198, 14336
+ %213 = getelementptr inbounds half, ptr addrspace(3) @global_smem, i32 %212
+ store <4 x i32> %91, ptr addrspace(3) %213, align 16
+ %214 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %198
+ store <4 x i32> %142, ptr addrspace(3) %214, align 16
+ %215 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %200
+ store <4 x i32> %145, ptr addrspace(3) %215, align 16
+ %216 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %202
+ store <4 x i32> %148, ptr addrspace(3) %216, align 16
+ %217 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %204
+ store <4 x i32> %151, ptr addrspace(3) %217, align 16
+ %218 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %206
+ store <4 x i32> %154, ptr addrspace(3) %218, align 16
+ %219 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %208
+ store <4 x i32> %157, ptr addrspace(3) %219, align 16
+ %220 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %210
+ store <4 x i32> %160, ptr addrspace(3) %220, align 16
+ %221 = getelementptr inbounds half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %212
+ store <4 x i32> %163, ptr addrspace(3) %221, align 16
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %222 = and i32 %32, 15
+ %223 = lshr i32 %32, 4
+ %224 = and i32 %223, 3
+ %225 = or disjoint i32 %34, %222
+ %226 = and i32 %32, 7
+ %227 = xor i32 %224, %226
+ %228 = shl nuw nsw i32 %227, 3
+ %229 = shl nuw nsw i32 %225, 6
+ %230 = or disjoint i32 %229, %228
+ %231 = or disjoint i32 %229, 2048
+ %232 = or disjoint i32 %231, %228
+ %233 = getelementptr half, ptr addrspace(3) @global_smem, i32 %230
+ %234 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %235 = getelementptr half, ptr addrspace(3) @global_smem, i32 %232
+ %236 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %237 = lshr i32 %32, 2
+ %238 = and i32 %237, 16
+ %239 = or disjoint i32 %238, %222
+ %240 = shl nuw nsw i32 %239, 6
+ %241 = or disjoint i32 %228, %240
+ %242 = or disjoint i32 %241, 2048
+ %243 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %241
+ %244 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %245 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %242
+ %246 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %247 = or disjoint i32 %229, 4096
+ %248 = or disjoint i32 %247, %228
+ %249 = or disjoint i32 %229, 6144
+ %250 = or disjoint i32 %249, %228
+ %251 = getelementptr half, ptr addrspace(3) @global_smem, i32 %248
+ %252 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %253 = getelementptr half, ptr addrspace(3) @global_smem, i32 %250
+ %254 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %255 = or disjoint i32 %240, 4096
+ %256 = or disjoint i32 %255, %228
+ %257 = or disjoint i32 %256, 2048
+ %258 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %256
+ %259 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %260 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %257
+ %261 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %262 = icmp sgt i32 %65, 191
+ br i1 %262, label %.lr.ph, label %.._crit_edge_crit_edge
+
+.._crit_edge_crit_edge: ; preds = %12
+ %.pre = or disjoint i32 %240, 8192
+ %.pre1013 = or disjoint i32 %.pre, %228
+ %.pre1015 = or disjoint i32 %240, 12288
+ %.pre1017 = or disjoint i32 %.pre1015, %228
+ %.pre1019 = or disjoint i32 %229, 8192
+ %.pre1021 = or disjoint i32 %.pre1019, %228
+ %.pre1023 = or disjoint i32 %229, 10240
+ %.pre1025 = or disjoint i32 %.pre1023, %228
+ %.pre1027 = or disjoint i32 %229, 12288
+ %.pre1029 = or disjoint i32 %.pre1027, %228
+ %.pre1031 = or disjoint i32 %229, 14336
+ %.pre1033 = or disjoint i32 %.pre1031, %228
+ %.pre1035 = or disjoint i32 %224, 4
+ %.pre1037 = xor i32 %.pre1035, %226
+ %.pre1039 = shl nuw nsw i32 %.pre1037, 3
+ %.pre1041 = or disjoint i32 %.pre1039, %240
+ %.pre1043 = or disjoint i32 %.pre1039, %229
+ %.pre1045 = or disjoint i32 %231, %.pre1039
+ %.pre1047 = or disjoint i32 %.pre1039, %255
+ %.pre1049 = or disjoint i32 %247, %.pre1039
+ %.pre1051 = or disjoint i32 %249, %.pre1039
+ %.pre1053 = or disjoint i32 %.pre1039, %.pre
+ %.pre1055 = or disjoint i32 %.pre1053, 2048
+ %.pre1057 = or disjoint i32 %.pre1039, %.pre1015
+ %.pre1059 = or disjoint i32 %.pre1057, 2048
+ %.pre1061 = or disjoint i32 %.pre1019, %.pre1039
+ %.pre1063 = or disjoint i32 %.pre1023, %.pre1039
+ %.pre1065 = or disjoint i32 %.pre1027, %.pre1039
+ %.pre1067 = or disjoint i32 %.pre1031, %.pre1039
+ %263 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %264 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %265 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %266 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %267 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %268 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %269 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %270 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %271 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %272 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %273 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %274 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %275 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %276 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %277 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %278 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %279 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %280 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %281 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %282 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %283 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %284 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %285 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %286 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %287 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %288 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %289 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %290 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %291 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %292 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %293 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %294 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %295 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %296 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %297 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %298 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %299 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %300 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %301 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %302 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %303 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %304 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %305 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %306 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %307 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %308 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %309 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %310 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %311 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %312 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %313 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %314 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %315 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %316 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %317 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %318 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %319 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %320 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %321 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %322 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %323 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %324 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %325 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %326 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %327 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %328 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %329 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %330 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %331 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %332 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %333 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %334 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %335 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %336 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %337 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %338 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %339 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %340 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %341 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %342 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %343 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %344 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %345 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %346 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %347 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %348 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %349 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %350 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %351 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %352 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %353 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %354 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %355 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %356 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %357 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %358 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ br label %._crit_edge
+
+.lr.ph: ; preds = %12
+ %359 = lshr i32 %65, 6
+ %invariant.op404 = or disjoint i32 %240, 6144
+ %invariant.op402 = or disjoint i32 %240, 2048
+ %invariant.op400 = or disjoint i32 %228, 2048
+ %360 = or disjoint i32 %240, 8192
+ %361 = or disjoint i32 %360, %228
+ %.reass = or disjoint i32 %360, %invariant.op400
+ %362 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %361
+ %363 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass
+ %364 = or disjoint i32 %240, 12288
+ %365 = or disjoint i32 %364, %228
+ %.reass401 = or disjoint i32 %364, %invariant.op400
+ %366 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %365
+ %367 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass401
+ %368 = or disjoint i32 %229, 8192
+ %369 = or disjoint i32 %368, %228
+ %370 = or disjoint i32 %229, 10240
+ %371 = or disjoint i32 %370, %228
+ %372 = getelementptr half, ptr addrspace(3) @global_smem, i32 %369
+ %373 = getelementptr half, ptr addrspace(3) @global_smem, i32 %371
+ %374 = or disjoint i32 %229, 12288
+ %375 = or disjoint i32 %374, %228
+ %376 = or disjoint i32 %229, 14336
+ %377 = or disjoint i32 %376, %228
+ %378 = getelementptr half, ptr addrspace(3) @global_smem, i32 %375
+ %379 = getelementptr half, ptr addrspace(3) @global_smem, i32 %377
+ %380 = or disjoint i32 %224, 4
+ %381 = xor i32 %380, %226
+ %382 = shl nuw nsw i32 %381, 3
+ %383 = or disjoint i32 %382, %229
+ %384 = or disjoint i32 %231, %382
+ %385 = getelementptr half, ptr addrspace(3) @global_smem, i32 %383
+ %386 = getelementptr half, ptr addrspace(3) @global_smem, i32 %384
+ %387 = or disjoint i32 %382, %240
+ %.reass403 = or disjoint i32 %382, %invariant.op402
+ %388 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %387
+ %389 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass403
+ %390 = or disjoint i32 %382, %255
+ %.reass405 = or disjoint i32 %382, %invariant.op404
+ %391 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %390
+ %392 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.reass405
+ %393 = or disjoint i32 %247, %382
+ %394 = or disjoint i32 %249, %382
+ %395 = getelementptr half, ptr addrspace(3) @global_smem, i32 %393
+ %396 = getelementptr half, ptr addrspace(3) @global_smem, i32 %394
+ %397 = or disjoint i32 %382, %360
+ %398 = or disjoint i32 %397, 2048
+ %399 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %397
+ %400 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %398
+ %401 = or disjoint i32 %382, %364
+ %402 = or disjoint i32 %401, 2048
+ %403 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %401
+ %404 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %402
+ %405 = or disjoint i32 %368, %382
+ %406 = or disjoint i32 %370, %382
+ %407 = getelementptr half, ptr addrspace(3) @global_smem, i32 %405
+ %408 = getelementptr half, ptr addrspace(3) @global_smem, i32 %406
+ %409 = or disjoint i32 %374, %382
+ %410 = or disjoint i32 %376, %382
+ %411 = getelementptr half, ptr addrspace(3) @global_smem, i32 %409
+ %412 = getelementptr half, ptr addrspace(3) @global_smem, i32 %410
+ %413 = add nsw i32 %359, -3
+ %414 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %415 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %416 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %417 = shufflevector <8 x half> %261, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %418 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %419 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %420 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %421 = shufflevector <8 x half> %259, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %422 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %423 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %424 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %425 = shufflevector <8 x half> %254, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %426 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %427 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %428 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %429 = shufflevector <8 x half> %252, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %430 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %431 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %432 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %433 = shufflevector <8 x half> %246, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %434 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %435 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %436 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %437 = shufflevector <8 x half> %244, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %438 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %439 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %440 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %441 = shufflevector <8 x half> %236, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %442 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %443 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %444 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %445 = shufflevector <8 x half> %234, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %446 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %447 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %448 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %449 = shufflevector <8 x half> %188, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %450 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %451 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %452 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %453 = shufflevector <8 x half> %185, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %454 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %455 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %456 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %457 = shufflevector <8 x half> %182, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %458 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %459 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %460 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %461 = shufflevector <8 x half> %179, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %462 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %463 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %464 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %465 = shufflevector <8 x half> %176, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %466 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %467 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %468 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %469 = shufflevector <8 x half> %173, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %470 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %471 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %472 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %473 = shufflevector <8 x half> %170, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %474 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %475 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %476 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %477 = shufflevector <8 x half> %167, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %478 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %479 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %480 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %481 = shufflevector <8 x half> %117, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %482 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %483 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %484 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %485 = shufflevector <8 x half> %114, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %486 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %487 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %488 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %489 = shufflevector <8 x half> %111, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %490 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %491 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %492 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %493 = shufflevector <8 x half> %108, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %494 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %495 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %496 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %497 = shufflevector <8 x half> %105, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %498 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %499 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %500 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %501 = shufflevector <8 x half> %102, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %502 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %503 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %504 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %505 = shufflevector <8 x half> %99, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %506 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %507 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %508 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %509 = shufflevector <8 x half> %96, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ br label %510
+
+510: ; preds = %.lr.ph, %510
+ %511 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %510 ]
+ %512 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %510 ]
+ %513 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %510 ]
+ %514 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %510 ]
+ %515 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %510 ]
+ %516 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %510 ]
+ %517 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %510 ]
+ %518 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %510 ]
+ %519 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %510 ]
+ %520 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %510 ]
+ %521 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %510 ]
+ %522 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %510 ]
+ %523 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %510 ]
+ %524 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %510 ]
+ %525 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %510 ]
+ %526 = phi float [ 0.000000e+00, %.lr.ph ], [ %1845, %510 ]
+ %527 = phi float [ 0.000000e+00, %.lr.ph ], [ %1800, %510 ]
+ %528 = phi float [ 0.000000e+00, %.lr.ph ], [ %1801, %510 ]
+ %529 = phi float [ 0.000000e+00, %.lr.ph ], [ %1802, %510 ]
+ %530 = phi float [ 0.000000e+00, %.lr.ph ], [ %1803, %510 ]
+ %531 = phi float [ 0.000000e+00, %.lr.ph ], [ %1806, %510 ]
+ %532 = phi float [ 0.000000e+00, %.lr.ph ], [ %1807, %510 ]
+ %533 = phi float [ 0.000000e+00, %.lr.ph ], [ %1808, %510 ]
+ %534 = phi float [ 0.000000e+00, %.lr.ph ], [ %1809, %510 ]
+ %535 = phi float [ 0.000000e+00, %.lr.ph ], [ %1812, %510 ]
+ %536 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %510 ]
+ %537 = phi float [ 0.000000e+00, %.lr.ph ], [ %1814, %510 ]
+ %538 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %510 ]
+ %539 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %510 ]
+ %540 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %510 ]
+ %541 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %510 ]
+ %542 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %510 ]
+ %543 = phi float [ 0.000000e+00, %.lr.ph ], [ %1720, %510 ]
+ %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %1721, %510 ]
+ %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %1722, %510 ]
+ %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %1723, %510 ]
+ %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %1726, %510 ]
+ %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %1727, %510 ]
+ %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %1728, %510 ]
+ %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %1729, %510 ]
+ %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %1732, %510 ]
+ %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %1733, %510 ]
+ %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %1734, %510 ]
+ %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %1735, %510 ]
+ %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %1738, %510 ]
+ %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %1739, %510 ]
+ %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %1740, %510 ]
+ %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %1741, %510 ]
+ %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %1696, %510 ]
+ %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %1697, %510 ]
+ %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %1698, %510 ]
+ %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %1699, %510 ]
+ %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %1702, %510 ]
+ %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %1703, %510 ]
+ %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %1704, %510 ]
+ %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %1705, %510 ]
+ %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %1708, %510 ]
+ %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %1709, %510 ]
+ %569 = phi float [ 0.000000e+00, %.lr.ph ], [ %1710, %510 ]
+ %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %1711, %510 ]
+ %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %1714, %510 ]
+ %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %1715, %510 ]
+ %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %1716, %510 ]
+ %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %1717, %510 ]
+ %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %1772, %510 ]
+ %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %1773, %510 ]
+ %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %1774, %510 ]
+ %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %1775, %510 ]
+ %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %1778, %510 ]
+ %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %1779, %510 ]
+ %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %1780, %510 ]
+ %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %1781, %510 ]
+ %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %1784, %510 ]
+ %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %1785, %510 ]
+ %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %1786, %510 ]
+ %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %1787, %510 ]
+ %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %1790, %510 ]
+ %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %1791, %510 ]
+ %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %1792, %510 ]
+ %590 = phi float [ 0.000000e+00, %.lr.ph ], [ %1793, %510 ]
+ %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %1744, %510 ]
+ %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %1745, %510 ]
+ %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %1746, %510 ]
+ %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %1747, %510 ]
+ %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %1750, %510 ]
+ %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %1751, %510 ]
+ %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %1752, %510 ]
+ %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %1753, %510 ]
+ %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %1756, %510 ]
+ %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %1757, %510 ]
+ %601 = phi float [ 0.000000e+00, %.lr.ph ], [ %1758, %510 ]
+ %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %1759, %510 ]
+ %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %1762, %510 ]
+ %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %1763, %510 ]
+ %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %1764, %510 ]
+ %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %1765, %510 ]
+ %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %1668, %510 ]
+ %608 = phi float [ 0.000000e+00, %.lr.ph ], [ %1669, %510 ]
+ %609 = phi float [ 0.000000e+00, %.lr.ph ], [ %1670, %510 ]
+ %610 = phi float [ 0.000000e+00, %.lr.ph ], [ %1671, %510 ]
+ %611 = phi float [ 0.000000e+00, %.lr.ph ], [ %1674, %510 ]
+ %612 = phi float [ 0.000000e+00, %.lr.ph ], [ %1675, %510 ]
+ %613 = phi float [ 0.000000e+00, %.lr.ph ], [ %1676, %510 ]
+ %614 = phi float [ 0.000000e+00, %.lr.ph ], [ %1677, %510 ]
+ %615 = phi float [ 0.000000e+00, %.lr.ph ], [ %1680, %510 ]
+ %616 = phi float [ 0.000000e+00, %.lr.ph ], [ %1681, %510 ]
+ %617 = phi float [ 0.000000e+00, %.lr.ph ], [ %1682, %510 ]
+ %618 = phi float [ 0.000000e+00, %.lr.ph ], [ %1683, %510 ]
+ %619 = phi float [ 0.000000e+00, %.lr.ph ], [ %1686, %510 ]
+ %620 = phi float [ 0.000000e+00, %.lr.ph ], [ %1687, %510 ]
+ %621 = phi float [ 0.000000e+00, %.lr.ph ], [ %1688, %510 ]
+ %622 = phi float [ 0.000000e+00, %.lr.ph ], [ %1689, %510 ]
+ %623 = phi float [ 0.000000e+00, %.lr.ph ], [ %1644, %510 ]
+ %624 = phi float [ 0.000000e+00, %.lr.ph ], [ %1645, %510 ]
+ %625 = phi float [ 0.000000e+00, %.lr.ph ], [ %1646, %510 ]
+ %626 = phi float [ 0.000000e+00, %.lr.ph ], [ %1647, %510 ]
+ %627 = phi float [ 0.000000e+00, %.lr.ph ], [ %1650, %510 ]
+ %628 = phi float [ 0.000000e+00, %.lr.ph ], [ %1651, %510 ]
+ %629 = phi float [ 0.000000e+00, %.lr.ph ], [ %1652, %510 ]
+ %630 = phi float [ 0.000000e+00, %.lr.ph ], [ %1653, %510 ]
+ %631 = phi float [ 0.000000e+00, %.lr.ph ], [ %1656, %510 ]
+ %632 = phi float [ 0.000000e+00, %.lr.ph ], [ %1657, %510 ]
+ %633 = phi float [ 0.000000e+00, %.lr.ph ], [ %1658, %510 ]
+ %634 = phi float [ 0.000000e+00, %.lr.ph ], [ %1659, %510 ]
+ %635 = phi float [ 0.000000e+00, %.lr.ph ], [ %1662, %510 ]
+ %636 = phi float [ 0.000000e+00, %.lr.ph ], [ %1663, %510 ]
+ %637 = phi float [ 0.000000e+00, %.lr.ph ], [ %1664, %510 ]
+ %638 = phi float [ 0.000000e+00, %.lr.ph ], [ %1665, %510 ]
+ %639 = phi float [ 0.000000e+00, %.lr.ph ], [ %1558, %510 ]
+ %640 = phi float [ 0.000000e+00, %.lr.ph ], [ %1559, %510 ]
+ %641 = phi float [ 0.000000e+00, %.lr.ph ], [ %1560, %510 ]
+ %642 = phi float [ 0.000000e+00, %.lr.ph ], [ %1561, %510 ]
+ %643 = phi float [ 0.000000e+00, %.lr.ph ], [ %1564, %510 ]
+ %644 = phi float [ 0.000000e+00, %.lr.ph ], [ %1565, %510 ]
+ %645 = phi float [ 0.000000e+00, %.lr.ph ], [ %1566, %510 ]
+ %646 = phi float [ 0.000000e+00, %.lr.ph ], [ %1567, %510 ]
+ %647 = phi float [ 0.000000e+00, %.lr.ph ], [ %1570, %510 ]
+ %648 = phi float [ 0.000000e+00, %.lr.ph ], [ %1571, %510 ]
+ %649 = phi float [ 0.000000e+00, %.lr.ph ], [ %1572, %510 ]
+ %650 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %510 ]
+ %651 = phi float [ 0.000000e+00, %.lr.ph ], [ %1576, %510 ]
+ %652 = phi float [ 0.000000e+00, %.lr.ph ], [ %1577, %510 ]
+ %653 = phi float [ 0.000000e+00, %.lr.ph ], [ %1578, %510 ]
+ %654 = phi float [ 0.000000e+00, %.lr.ph ], [ %1579, %510 ]
+ %655 = phi float [ 0.000000e+00, %.lr.ph ], [ %1534, %510 ]
+ %656 = phi float [ 0.000000e+00, %.lr.ph ], [ %1535, %510 ]
+ %657 = phi float [ 0.000000e+00, %.lr.ph ], [ %1536, %510 ]
+ %658 = phi float [ 0.000000e+00, %.lr.ph ], [ %1537, %510 ]
+ %659 = phi float [ 0.000000e+00, %.lr.ph ], [ %1540, %510 ]
+ %660 = phi float [ 0.000000e+00, %.lr.ph ], [ %1541, %510 ]
+ %661 = phi float [ 0.000000e+00, %.lr.ph ], [ %1542, %510 ]
+ %662 = phi float [ 0.000000e+00, %.lr.ph ], [ %1543, %510 ]
+ %663 = phi float [ 0.000000e+00, %.lr.ph ], [ %1546, %510 ]
+ %664 = phi float [ 0.000000e+00, %.lr.ph ], [ %1547, %510 ]
+ %665 = phi float [ 0.000000e+00, %.lr.ph ], [ %1548, %510 ]
+ %666 = phi float [ 0.000000e+00, %.lr.ph ], [ %1549, %510 ]
+ %667 = phi float [ 0.000000e+00, %.lr.ph ], [ %1552, %510 ]
+ %668 = phi float [ 0.000000e+00, %.lr.ph ], [ %1553, %510 ]
+ %669 = phi float [ 0.000000e+00, %.lr.ph ], [ %1554, %510 ]
+ %670 = phi float [ 0.000000e+00, %.lr.ph ], [ %1555, %510 ]
+ %671 = phi float [ 0.000000e+00, %.lr.ph ], [ %1396, %510 ]
+ %672 = phi float [ 0.000000e+00, %.lr.ph ], [ %1397, %510 ]
+ %673 = phi float [ 0.000000e+00, %.lr.ph ], [ %1398, %510 ]
+ %674 = phi float [ 0.000000e+00, %.lr.ph ], [ %1399, %510 ]
+ %675 = phi float [ 0.000000e+00, %.lr.ph ], [ %1402, %510 ]
+ %676 = phi float [ 0.000000e+00, %.lr.ph ], [ %1403, %510 ]
+ %677 = phi float [ 0.000000e+00, %.lr.ph ], [ %1404, %510 ]
+ %678 = phi float [ 0.000000e+00, %.lr.ph ], [ %1405, %510 ]
+ %679 = phi float [ 0.000000e+00, %.lr.ph ], [ %1408, %510 ]
+ %680 = phi float [ 0.000000e+00, %.lr.ph ], [ %1409, %510 ]
+ %681 = phi float [ 0.000000e+00, %.lr.ph ], [ %1410, %510 ]
+ %682 = phi float [ 0.000000e+00, %.lr.ph ], [ %1411, %510 ]
+ %683 = phi float [ 0.000000e+00, %.lr.ph ], [ %1414, %510 ]
+ %684 = phi float [ 0.000000e+00, %.lr.ph ], [ %1415, %510 ]
+ %685 = phi float [ 0.000000e+00, %.lr.ph ], [ %1416, %510 ]
+ %686 = phi float [ 0.000000e+00, %.lr.ph ], [ %1417, %510 ]
+ %687 = phi float [ 0.000000e+00, %.lr.ph ], [ %1372, %510 ]
+ %688 = phi float [ 0.000000e+00, %.lr.ph ], [ %1373, %510 ]
+ %689 = phi float [ 0.000000e+00, %.lr.ph ], [ %1374, %510 ]
+ %690 = phi float [ 0.000000e+00, %.lr.ph ], [ %1375, %510 ]
+ %691 = phi float [ 0.000000e+00, %.lr.ph ], [ %1378, %510 ]
+ %692 = phi float [ 0.000000e+00, %.lr.ph ], [ %1379, %510 ]
+ %693 = phi float [ 0.000000e+00, %.lr.ph ], [ %1380, %510 ]
+ %694 = phi float [ 0.000000e+00, %.lr.ph ], [ %1381, %510 ]
+ %695 = phi float [ 0.000000e+00, %.lr.ph ], [ %1384, %510 ]
+ %696 = phi float [ 0.000000e+00, %.lr.ph ], [ %1385, %510 ]
+ %697 = phi float [ 0.000000e+00, %.lr.ph ], [ %1386, %510 ]
+ %698 = phi float [ 0.000000e+00, %.lr.ph ], [ %1387, %510 ]
+ %699 = phi float [ 0.000000e+00, %.lr.ph ], [ %1390, %510 ]
+ %700 = phi float [ 0.000000e+00, %.lr.ph ], [ %1391, %510 ]
+ %701 = phi float [ 0.000000e+00, %.lr.ph ], [ %1392, %510 ]
+ %702 = phi float [ 0.000000e+00, %.lr.ph ], [ %1393, %510 ]
+ %703 = phi float [ 0.000000e+00, %.lr.ph ], [ %1510, %510 ]
+ %704 = phi float [ 0.000000e+00, %.lr.ph ], [ %1511, %510 ]
+ %705 = phi float [ 0.000000e+00, %.lr.ph ], [ %1512, %510 ]
+ %706 = phi float [ 0.000000e+00, %.lr.ph ], [ %1513, %510 ]
+ %707 = phi float [ 0.000000e+00, %.lr.ph ], [ %1516, %510 ]
+ %708 = phi float [ 0.000000e+00, %.lr.ph ], [ %1517, %510 ]
+ %709 = phi float [ 0.000000e+00, %.lr.ph ], [ %1518, %510 ]
+ %710 = phi float [ 0.000000e+00, %.lr.ph ], [ %1519, %510 ]
+ %711 = phi float [ 0.000000e+00, %.lr.ph ], [ %1522, %510 ]
+ %712 = phi float [ 0.000000e+00, %.lr.ph ], [ %1523, %510 ]
+ %713 = phi float [ 0.000000e+00, %.lr.ph ], [ %1524, %510 ]
+ %714 = phi float [ 0.000000e+00, %.lr.ph ], [ %1525, %510 ]
+ %715 = phi float [ 0.000000e+00, %.lr.ph ], [ %1528, %510 ]
+ %716 = phi float [ 0.000000e+00, %.lr.ph ], [ %1529, %510 ]
+ %717 = phi float [ 0.000000e+00, %.lr.ph ], [ %1530, %510 ]
+ %718 = phi float [ 0.000000e+00, %.lr.ph ], [ %1531, %510 ]
+ %719 = phi float [ 0.000000e+00, %.lr.ph ], [ %1482, %510 ]
+ %720 = phi float [ 0.000000e+00, %.lr.ph ], [ %1483, %510 ]
+ %721 = phi float [ 0.000000e+00, %.lr.ph ], [ %1484, %510 ]
+ %722 = phi float [ 0.000000e+00, %.lr.ph ], [ %1485, %510 ]
+ %723 = phi float [ 0.000000e+00, %.lr.ph ], [ %1488, %510 ]
+ %724 = phi float [ 0.000000e+00, %.lr.ph ], [ %1489, %510 ]
+ %725 = phi float [ 0.000000e+00, %.lr.ph ], [ %1490, %510 ]
+ %726 = phi float [ 0.000000e+00, %.lr.ph ], [ %1491, %510 ]
+ %727 = phi float [ 0.000000e+00, %.lr.ph ], [ %1494, %510 ]
+ %728 = phi float [ 0.000000e+00, %.lr.ph ], [ %1495, %510 ]
+ %729 = phi float [ 0.000000e+00, %.lr.ph ], [ %1496, %510 ]
+ %730 = phi float [ 0.000000e+00, %.lr.ph ], [ %1497, %510 ]
+ %731 = phi float [ 0.000000e+00, %.lr.ph ], [ %1500, %510 ]
+ %732 = phi float [ 0.000000e+00, %.lr.ph ], [ %1501, %510 ]
+ %733 = phi float [ 0.000000e+00, %.lr.ph ], [ %1502, %510 ]
+ %734 = phi float [ 0.000000e+00, %.lr.ph ], [ %1503, %510 ]
+ %735 = phi float [ 0.000000e+00, %.lr.ph ], [ %1340, %510 ]
+ %736 = phi float [ 0.000000e+00, %.lr.ph ], [ %1341, %510 ]
+ %737 = phi float [ 0.000000e+00, %.lr.ph ], [ %1342, %510 ]
+ %738 = phi float [ 0.000000e+00, %.lr.ph ], [ %1343, %510 ]
+ %739 = phi float [ 0.000000e+00, %.lr.ph ], [ %1346, %510 ]
+ %740 = phi float [ 0.000000e+00, %.lr.ph ], [ %1347, %510 ]
+ %741 = phi float [ 0.000000e+00, %.lr.ph ], [ %1348, %510 ]
+ %742 = phi float [ 0.000000e+00, %.lr.ph ], [ %1349, %510 ]
+ %743 = phi float [ 0.000000e+00, %.lr.ph ], [ %1352, %510 ]
+ %744 = phi float [ 0.000000e+00, %.lr.ph ], [ %1353, %510 ]
+ %745 = phi float [ 0.000000e+00, %.lr.ph ], [ %1354, %510 ]
+ %746 = phi float [ 0.000000e+00, %.lr.ph ], [ %1355, %510 ]
+ %747 = phi float [ 0.000000e+00, %.lr.ph ], [ %1358, %510 ]
+ %748 = phi float [ 0.000000e+00, %.lr.ph ], [ %1359, %510 ]
+ %749 = phi float [ 0.000000e+00, %.lr.ph ], [ %1360, %510 ]
+ %750 = phi float [ 0.000000e+00, %.lr.ph ], [ %1361, %510 ]
+ %751 = phi ptr addrspace(1) [ %138, %.lr.ph ], [ %1620, %510 ]
+ %752 = phi ptr addrspace(1) [ %64, %.lr.ph ], [ %1458, %510 ]
+ %753 = phi float [ 0.000000e+00, %.lr.ph ], [ %1308, %510 ]
+ %754 = phi float [ 0.000000e+00, %.lr.ph ], [ %1309, %510 ]
+ %755 = phi float [ 0.000000e+00, %.lr.ph ], [ %1310, %510 ]
+ %756 = phi float [ 0.000000e+00, %.lr.ph ], [ %1311, %510 ]
+ %757 = phi float [ 0.000000e+00, %.lr.ph ], [ %1314, %510 ]
+ %758 = phi float [ 0.000000e+00, %.lr.ph ], [ %1315, %510 ]
+ %759 = phi float [ 0.000000e+00, %.lr.ph ], [ %1316, %510 ]
+ %760 = phi float [ 0.000000e+00, %.lr.ph ], [ %1317, %510 ]
+ %761 = phi float [ 0.000000e+00, %.lr.ph ], [ %1320, %510 ]
+ %762 = phi float [ 0.000000e+00, %.lr.ph ], [ %1321, %510 ]
+ %763 = phi float [ 0.000000e+00, %.lr.ph ], [ %1322, %510 ]
+ %764 = phi float [ 0.000000e+00, %.lr.ph ], [ %1323, %510 ]
+ %765 = phi float [ 0.000000e+00, %.lr.ph ], [ %1326, %510 ]
+ %766 = phi float [ 0.000000e+00, %.lr.ph ], [ %1327, %510 ]
+ %767 = phi float [ 0.000000e+00, %.lr.ph ], [ %1328, %510 ]
+ %768 = phi float [ 0.000000e+00, %.lr.ph ], [ %1329, %510 ]
+ %769 = phi i32 [ 0, %.lr.ph ], [ %1846, %510 ]
+ %770 = phi <2 x half> [ %414, %.lr.ph ], [ %1910, %510 ]
+ %771 = phi <2 x half> [ %415, %.lr.ph ], [ %1909, %510 ]
+ %772 = phi <2 x half> [ %416, %.lr.ph ], [ %1908, %510 ]
+ %773 = phi <2 x half> [ %417, %.lr.ph ], [ %1907, %510 ]
+ %774 = phi <2 x half> [ %418, %.lr.ph ], [ %1906, %510 ]
+ %775 = phi <2 x half> [ %419, %.lr.ph ], [ %1905, %510 ]
+ %776 = phi <2 x half> [ %420, %.lr.ph ], [ %1904, %510 ]
+ %777 = phi <2 x half> [ %421, %.lr.ph ], [ %1903, %510 ]
+ %778 = phi <2 x half> [ %422, %.lr.ph ], [ %1902, %510 ]
+ %779 = phi <2 x half> [ %423, %.lr.ph ], [ %1901, %510 ]
+ %780 = phi <2 x half> [ %424, %.lr.ph ], [ %1900, %510 ]
+ %781 = phi <2 x half> [ %425, %.lr.ph ], [ %1899, %510 ]
+ %782 = phi <2 x half> [ %426, %.lr.ph ], [ %1898, %510 ]
+ %783 = phi <2 x half> [ %427, %.lr.ph ], [ %1897, %510 ]
+ %784 = phi <2 x half> [ %428, %.lr.ph ], [ %1896, %510 ]
+ %785 = phi <2 x half> [ %429, %.lr.ph ], [ %1895, %510 ]
+ %786 = phi <2 x half> [ %430, %.lr.ph ], [ %1894, %510 ]
+ %787 = phi <2 x half> [ %431, %.lr.ph ], [ %1893, %510 ]
+ %788 = phi <2 x half> [ %432, %.lr.ph ], [ %1892, %510 ]
+ %789 = phi <2 x half> [ %433, %.lr.ph ], [ %1891, %510 ]
+ %790 = phi <2 x half> [ %434, %.lr.ph ], [ %1890, %510 ]
+ %791 = phi <2 x half> [ %435, %.lr.ph ], [ %1889, %510 ]
+ %792 = phi <2 x half> [ %436, %.lr.ph ], [ %1888, %510 ]
+ %793 = phi <2 x half> [ %437, %.lr.ph ], [ %1887, %510 ]
+ %794 = phi <2 x half> [ %438, %.lr.ph ], [ %1886, %510 ]
+ %795 = phi <2 x half> [ %439, %.lr.ph ], [ %1885, %510 ]
+ %796 = phi <2 x half> [ %440, %.lr.ph ], [ %1884, %510 ]
+ %797 = phi <2 x half> [ %441, %.lr.ph ], [ %1883, %510 ]
+ %798 = phi <2 x half> [ %442, %.lr.ph ], [ %1882, %510 ]
+ %799 = phi <2 x half> [ %443, %.lr.ph ], [ %1881, %510 ]
+ %800 = phi <2 x half> [ %444, %.lr.ph ], [ %1880, %510 ]
+ %801 = phi <2 x half> [ %445, %.lr.ph ], [ %1879, %510 ]
+ %802 = phi <2 x half> [ %446, %.lr.ph ], [ %1878, %510 ]
+ %803 = phi <2 x half> [ %447, %.lr.ph ], [ %1942, %510 ]
+ %804 = phi <2 x half> [ %448, %.lr.ph ], [ %1941, %510 ]
+ %805 = phi <2 x half> [ %449, %.lr.ph ], [ %1877, %510 ]
+ %806 = phi <2 x half> [ %450, %.lr.ph ], [ %1876, %510 ]
+ %807 = phi <2 x half> [ %451, %.lr.ph ], [ %1940, %510 ]
+ %808 = phi <2 x half> [ %452, %.lr.ph ], [ %1939, %510 ]
+ %809 = phi <2 x half> [ %453, %.lr.ph ], [ %1875, %510 ]
+ %810 = phi <2 x half> [ %454, %.lr.ph ], [ %1874, %510 ]
+ %811 = phi <2 x half> [ %455, %.lr.ph ], [ %1938, %510 ]
+ %812 = phi <2 x half> [ %456, %.lr.ph ], [ %1937, %510 ]
+ %813 = phi <2 x half> [ %457, %.lr.ph ], [ %1873, %510 ]
+ %814 = phi <2 x half> [ %458, %.lr.ph ], [ %1872, %510 ]
+ %815 = phi <2 x half> [ %459, %.lr.ph ], [ %1936, %510 ]
+ %816 = phi <2 x half> [ %460, %.lr.ph ], [ %1935, %510 ]
+ %817 = phi <2 x half> [ %461, %.lr.ph ], [ %1871, %510 ]
+ %818 = phi <2 x half> [ %462, %.lr.ph ], [ %1870, %510 ]
+ %819 = phi <2 x half> [ %463, %.lr.ph ], [ %1934, %510 ]
+ %820 = phi <2 x half> [ %464, %.lr.ph ], [ %1933, %510 ]
+ %821 = phi <2 x half> [ %465, %.lr.ph ], [ %1869, %510 ]
+ %822 = phi <2 x half> [ %466, %.lr.ph ], [ %1868, %510 ]
+ %823 = phi <2 x half> [ %467, %.lr.ph ], [ %1932, %510 ]
+ %824 = phi <2 x half> [ %468, %.lr.ph ], [ %1931, %510 ]
+ %825 = phi <2 x half> [ %469, %.lr.ph ], [ %1867, %510 ]
+ %826 = phi <2 x half> [ %470, %.lr.ph ], [ %1866, %510 ]
+ %827 = phi <2 x half> [ %471, %.lr.ph ], [ %1930, %510 ]
+ %828 = phi <2 x half> [ %472, %.lr.ph ], [ %1929, %510 ]
+ %829 = phi <2 x half> [ %473, %.lr.ph ], [ %1865, %510 ]
+ %830 = phi <2 x half> [ %474, %.lr.ph ], [ %1864, %510 ]
+ %831 = phi <2 x half> [ %475, %.lr.ph ], [ %1928, %510 ]
+ %832 = phi <2 x half> [ %476, %.lr.ph ], [ %1927, %510 ]
+ %833 = phi <2 x half> [ %477, %.lr.ph ], [ %1863, %510 ]
+ %834 = phi <2 x half> [ %478, %.lr.ph ], [ %1862, %510 ]
+ %835 = phi <2 x half> [ %479, %.lr.ph ], [ %1926, %510 ]
+ %836 = phi <2 x half> [ %480, %.lr.ph ], [ %1925, %510 ]
+ %837 = phi <2 x half> [ %481, %.lr.ph ], [ %1861, %510 ]
+ %838 = phi <2 x half> [ %482, %.lr.ph ], [ %1860, %510 ]
+ %839 = phi <2 x half> [ %483, %.lr.ph ], [ %1924, %510 ]
+ %840 = phi <2 x half> [ %484, %.lr.ph ], [ %1923, %510 ]
+ %841 = phi <2 x half> [ %485, %.lr.ph ], [ %1859, %510 ]
+ %842 = phi <2 x half> [ %486, %.lr.ph ], [ %1858, %510 ]
+ %843 = phi <2 x half> [ %487, %.lr.ph ], [ %1922, %510 ]
+ %844 = phi <2 x half> [ %488, %.lr.ph ], [ %1921, %510 ]
+ %845 = phi <2 x half> [ %489, %.lr.ph ], [ %1857, %510 ]
+ %846 = phi <2 x half> [ %490, %.lr.ph ], [ %1856, %510 ]
+ %847 = phi <2 x half> [ %491, %.lr.ph ], [ %1920, %510 ]
+ %848 = phi <2 x half> [ %492, %.lr.ph ], [ %1919, %510 ]
+ %849 = phi <2 x half> [ %493, %.lr.ph ], [ %1855, %510 ]
+ %850 = phi <2 x half> [ %494, %.lr.ph ], [ %1854, %510 ]
+ %851 = phi <2 x half> [ %495, %.lr.ph ], [ %1918, %510 ]
+ %852 = phi <2 x half> [ %496, %.lr.ph ], [ %1917, %510 ]
+ %853 = phi <2 x half> [ %497, %.lr.ph ], [ %1853, %510 ]
+ %854 = phi <2 x half> [ %498, %.lr.ph ], [ %1852, %510 ]
+ %855 = phi <2 x half> [ %499, %.lr.ph ], [ %1916, %510 ]
+ %856 = phi <2 x half> [ %500, %.lr.ph ], [ %1915, %510 ]
+ %857 = phi <2 x half> [ %501, %.lr.ph ], [ %1851, %510 ]
+ %858 = phi <2 x half> [ %502, %.lr.ph ], [ %1850, %510 ]
+ %859 = phi <2 x half> [ %503, %.lr.ph ], [ %1914, %510 ]
+ %860 = phi <2 x half> [ %504, %.lr.ph ], [ %1913, %510 ]
+ %861 = phi <2 x half> [ %505, %.lr.ph ], [ %1849, %510 ]
+ %862 = phi <2 x half> [ %506, %.lr.ph ], [ %1848, %510 ]
+ %863 = phi <2 x half> [ %507, %.lr.ph ], [ %1912, %510 ]
+ %864 = phi <2 x half> [ %508, %.lr.ph ], [ %1911, %510 ]
+ %865 = phi <2 x half> [ %509, %.lr.ph ], [ %1847, %510 ]
+ %866 = shufflevector <2 x half> %801, <2 x half> %800, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %867 = shufflevector <2 x half> %799, <2 x half> %798, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %868 = shufflevector <2 x half> %797, <2 x half> %796, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %869 = shufflevector <2 x half> %795, <2 x half> %794, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %870 = shufflevector <2 x half> %793, <2 x half> %792, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %871 = shufflevector <2 x half> %791, <2 x half> %790, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %872 = shufflevector <2 x half> %789, <2 x half> %788, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %873 = shufflevector <2 x half> %787, <2 x half> %786, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %874 = insertelement <4 x float> poison, float %753, i64 0
+ %875 = insertelement <4 x float> %874, float %754, i64 1
+ %876 = insertelement <4 x float> %875, float %755, i64 2
+ %877 = insertelement <4 x float> %876, float %756, i64 3
+ %878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %866, <4 x float> %877, i32 0, i32 0, i32 0)
+ %879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %867, <4 x float> %878, i32 0, i32 0, i32 0)
+ %880 = insertelement <4 x float> poison, float %757, i64 0
+ %881 = insertelement <4 x float> %880, float %758, i64 1
+ %882 = insertelement <4 x float> %881, float %759, i64 2
+ %883 = insertelement <4 x float> %882, float %760, i64 3
+ %884 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %866, <4 x float> %883, i32 0, i32 0, i32 0)
+ %885 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %867, <4 x float> %884, i32 0, i32 0, i32 0)
+ %886 = insertelement <4 x float> poison, float %761, i64 0
+ %887 = insertelement <4 x float> %886, float %762, i64 1
+ %888 = insertelement <4 x float> %887, float %763, i64 2
+ %889 = insertelement <4 x float> %888, float %764, i64 3
+ %890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %868, <4 x float> %889, i32 0, i32 0, i32 0)
+ %891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %869, <4 x float> %890, i32 0, i32 0, i32 0)
+ %892 = insertelement <4 x float> poison, float %765, i64 0
+ %893 = insertelement <4 x float> %892, float %766, i64 1
+ %894 = insertelement <4 x float> %893, float %767, i64 2
+ %895 = insertelement <4 x float> %894, float %768, i64 3
+ %896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %868, <4 x float> %895, i32 0, i32 0, i32 0)
+ %897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %869, <4 x float> %896, i32 0, i32 0, i32 0)
+ %898 = shufflevector <2 x half> %777, <2 x half> %776, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %899 = shufflevector <2 x half> %775, <2 x half> %774, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %900 = shufflevector <2 x half> %773, <2 x half> %772, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %901 = shufflevector <2 x half> %771, <2 x half> %770, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %902 = insertelement <4 x float> poison, float %735, i64 0
+ %903 = insertelement <4 x float> %902, float %736, i64 1
+ %904 = insertelement <4 x float> %903, float %737, i64 2
+ %905 = insertelement <4 x float> %904, float %738, i64 3
+ %906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %866, <4 x float> %905, i32 0, i32 0, i32 0)
+ %907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %867, <4 x float> %906, i32 0, i32 0, i32 0)
+ %908 = insertelement <4 x float> poison, float %739, i64 0
+ %909 = insertelement <4 x float> %908, float %740, i64 1
+ %910 = insertelement <4 x float> %909, float %741, i64 2
+ %911 = insertelement <4 x float> %910, float %742, i64 3
+ %912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %866, <4 x float> %911, i32 0, i32 0, i32 0)
+ %913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %867, <4 x float> %912, i32 0, i32 0, i32 0)
+ %914 = insertelement <4 x float> poison, float %743, i64 0
+ %915 = insertelement <4 x float> %914, float %744, i64 1
+ %916 = insertelement <4 x float> %915, float %745, i64 2
+ %917 = insertelement <4 x float> %916, float %746, i64 3
+ %918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %868, <4 x float> %917, i32 0, i32 0, i32 0)
+ %919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %869, <4 x float> %918, i32 0, i32 0, i32 0)
+ %920 = insertelement <4 x float> poison, float %747, i64 0
+ %921 = insertelement <4 x float> %920, float %748, i64 1
+ %922 = insertelement <4 x float> %921, float %749, i64 2
+ %923 = insertelement <4 x float> %922, float %750, i64 3
+ %924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %868, <4 x float> %923, i32 0, i32 0, i32 0)
+ %925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %869, <4 x float> %924, i32 0, i32 0, i32 0)
+ %926 = load <8 x half>, ptr addrspace(3) %362, align 16
+ %927 = load <8 x half>, ptr addrspace(3) %363, align 16
+ %928 = load <8 x half>, ptr addrspace(3) %366, align 16
+ %929 = load <8 x half>, ptr addrspace(3) %367, align 16
+ %930 = shufflevector <2 x half> %785, <2 x half> %784, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %931 = shufflevector <2 x half> %783, <2 x half> %782, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %932 = shufflevector <2 x half> %781, <2 x half> %780, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %933 = shufflevector <2 x half> %779, <2 x half> %778, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %934 = insertelement <4 x float> poison, float %687, i64 0
+ %935 = insertelement <4 x float> %934, float %688, i64 1
+ %936 = insertelement <4 x float> %935, float %689, i64 2
+ %937 = insertelement <4 x float> %936, float %690, i64 3
+ %938 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %930, <4 x float> %937, i32 0, i32 0, i32 0)
+ %939 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %931, <4 x float> %938, i32 0, i32 0, i32 0)
+ %940 = insertelement <4 x float> poison, float %691, i64 0
+ %941 = insertelement <4 x float> %940, float %692, i64 1
+ %942 = insertelement <4 x float> %941, float %693, i64 2
+ %943 = insertelement <4 x float> %942, float %694, i64 3
+ %944 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %930, <4 x float> %943, i32 0, i32 0, i32 0)
+ %945 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %931, <4 x float> %944, i32 0, i32 0, i32 0)
+ %946 = insertelement <4 x float> poison, float %695, i64 0
+ %947 = insertelement <4 x float> %946, float %696, i64 1
+ %948 = insertelement <4 x float> %947, float %697, i64 2
+ %949 = insertelement <4 x float> %948, float %698, i64 3
+ %950 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %932, <4 x float> %949, i32 0, i32 0, i32 0)
+ %951 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %933, <4 x float> %950, i32 0, i32 0, i32 0)
+ %952 = insertelement <4 x float> poison, float %699, i64 0
+ %953 = insertelement <4 x float> %952, float %700, i64 1
+ %954 = insertelement <4 x float> %953, float %701, i64 2
+ %955 = insertelement <4 x float> %954, float %702, i64 3
+ %956 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %932, <4 x float> %955, i32 0, i32 0, i32 0)
+ %957 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %933, <4 x float> %956, i32 0, i32 0, i32 0)
+ %958 = insertelement <4 x float> poison, float %671, i64 0
+ %959 = insertelement <4 x float> %958, float %672, i64 1
+ %960 = insertelement <4 x float> %959, float %673, i64 2
+ %961 = insertelement <4 x float> %960, float %674, i64 3
+ %962 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %930, <4 x float> %961, i32 0, i32 0, i32 0)
+ %963 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %931, <4 x float> %962, i32 0, i32 0, i32 0)
+ %964 = insertelement <4 x float> poison, float %675, i64 0
+ %965 = insertelement <4 x float> %964, float %676, i64 1
+ %966 = insertelement <4 x float> %965, float %677, i64 2
+ %967 = insertelement <4 x float> %966, float %678, i64 3
+ %968 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %930, <4 x float> %967, i32 0, i32 0, i32 0)
+ %969 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %931, <4 x float> %968, i32 0, i32 0, i32 0)
+ %970 = insertelement <4 x float> poison, float %679, i64 0
+ %971 = insertelement <4 x float> %970, float %680, i64 1
+ %972 = insertelement <4 x float> %971, float %681, i64 2
+ %973 = insertelement <4 x float> %972, float %682, i64 3
+ %974 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %932, <4 x float> %973, i32 0, i32 0, i32 0)
+ %975 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %933, <4 x float> %974, i32 0, i32 0, i32 0)
+ %976 = insertelement <4 x float> poison, float %683, i64 0
+ %977 = insertelement <4 x float> %976, float %684, i64 1
+ %978 = insertelement <4 x float> %977, float %685, i64 2
+ %979 = insertelement <4 x float> %978, float %686, i64 3
+ %980 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %932, <4 x float> %979, i32 0, i32 0, i32 0)
+ %981 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %933, <4 x float> %980, i32 0, i32 0, i32 0)
+ %982 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %983 = shufflevector <8 x half> %926, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %984 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %985 = shufflevector <8 x half> %927, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %986 = insertelement <4 x float> poison, float %719, i64 0
+ %987 = insertelement <4 x float> %986, float %720, i64 1
+ %988 = insertelement <4 x float> %987, float %721, i64 2
+ %989 = insertelement <4 x float> %988, float %722, i64 3
+ %990 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %866, <4 x float> %989, i32 0, i32 0, i32 0)
+ %991 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %867, <4 x float> %990, i32 0, i32 0, i32 0)
+ %992 = insertelement <4 x float> poison, float %723, i64 0
+ %993 = insertelement <4 x float> %992, float %724, i64 1
+ %994 = insertelement <4 x float> %993, float %725, i64 2
+ %995 = insertelement <4 x float> %994, float %726, i64 3
+ %996 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %866, <4 x float> %995, i32 0, i32 0, i32 0)
+ %997 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %867, <4 x float> %996, i32 0, i32 0, i32 0)
+ %998 = insertelement <4 x float> poison, float %727, i64 0
+ %999 = insertelement <4 x float> %998, float %728, i64 1
+ %1000 = insertelement <4 x float> %999, float %729, i64 2
+ %1001 = insertelement <4 x float> %1000, float %730, i64 3
+ %1002 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %868, <4 x float> %1001, i32 0, i32 0, i32 0)
+ %1003 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %869, <4 x float> %1002, i32 0, i32 0, i32 0)
+ %1004 = insertelement <4 x float> poison, float %731, i64 0
+ %1005 = insertelement <4 x float> %1004, float %732, i64 1
+ %1006 = insertelement <4 x float> %1005, float %733, i64 2
+ %1007 = insertelement <4 x float> %1006, float %734, i64 3
+ %1008 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %868, <4 x float> %1007, i32 0, i32 0, i32 0)
+ %1009 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %869, <4 x float> %1008, i32 0, i32 0, i32 0)
+ %1010 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1011 = shufflevector <8 x half> %928, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1012 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1013 = shufflevector <8 x half> %929, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1014 = insertelement <4 x float> poison, float %703, i64 0
+ %1015 = insertelement <4 x float> %1014, float %704, i64 1
+ %1016 = insertelement <4 x float> %1015, float %705, i64 2
+ %1017 = insertelement <4 x float> %1016, float %706, i64 3
+ %1018 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %866, <4 x float> %1017, i32 0, i32 0, i32 0)
+ %1019 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %867, <4 x float> %1018, i32 0, i32 0, i32 0)
+ %1020 = insertelement <4 x float> poison, float %707, i64 0
+ %1021 = insertelement <4 x float> %1020, float %708, i64 1
+ %1022 = insertelement <4 x float> %1021, float %709, i64 2
+ %1023 = insertelement <4 x float> %1022, float %710, i64 3
+ %1024 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %866, <4 x float> %1023, i32 0, i32 0, i32 0)
+ %1025 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %867, <4 x float> %1024, i32 0, i32 0, i32 0)
+ %1026 = insertelement <4 x float> poison, float %711, i64 0
+ %1027 = insertelement <4 x float> %1026, float %712, i64 1
+ %1028 = insertelement <4 x float> %1027, float %713, i64 2
+ %1029 = insertelement <4 x float> %1028, float %714, i64 3
+ %1030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %868, <4 x float> %1029, i32 0, i32 0, i32 0)
+ %1031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %869, <4 x float> %1030, i32 0, i32 0, i32 0)
+ %1032 = insertelement <4 x float> poison, float %715, i64 0
+ %1033 = insertelement <4 x float> %1032, float %716, i64 1
+ %1034 = insertelement <4 x float> %1033, float %717, i64 2
+ %1035 = insertelement <4 x float> %1034, float %718, i64 3
+ %1036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %868, <4 x float> %1035, i32 0, i32 0, i32 0)
+ %1037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %869, <4 x float> %1036, i32 0, i32 0, i32 0)
+ %1038 = load <8 x half>, ptr addrspace(3) %372, align 16
+ %1039 = load <8 x half>, ptr addrspace(3) %373, align 16
+ %1040 = load <8 x half>, ptr addrspace(3) %378, align 16
+ %1041 = load <8 x half>, ptr addrspace(3) %379, align 16
+ %1042 = insertelement <4 x float> poison, float %655, i64 0
+ %1043 = insertelement <4 x float> %1042, float %656, i64 1
+ %1044 = insertelement <4 x float> %1043, float %657, i64 2
+ %1045 = insertelement <4 x float> %1044, float %658, i64 3
+ %1046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %930, <4 x float> %1045, i32 0, i32 0, i32 0)
+ %1047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %931, <4 x float> %1046, i32 0, i32 0, i32 0)
+ %1048 = insertelement <4 x float> poison, float %659, i64 0
+ %1049 = insertelement <4 x float> %1048, float %660, i64 1
+ %1050 = insertelement <4 x float> %1049, float %661, i64 2
+ %1051 = insertelement <4 x float> %1050, float %662, i64 3
+ %1052 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %930, <4 x float> %1051, i32 0, i32 0, i32 0)
+ %1053 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %931, <4 x float> %1052, i32 0, i32 0, i32 0)
+ %1054 = insertelement <4 x float> poison, float %663, i64 0
+ %1055 = insertelement <4 x float> %1054, float %664, i64 1
+ %1056 = insertelement <4 x float> %1055, float %665, i64 2
+ %1057 = insertelement <4 x float> %1056, float %666, i64 3
+ %1058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %932, <4 x float> %1057, i32 0, i32 0, i32 0)
+ %1059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %933, <4 x float> %1058, i32 0, i32 0, i32 0)
+ %1060 = insertelement <4 x float> poison, float %667, i64 0
+ %1061 = insertelement <4 x float> %1060, float %668, i64 1
+ %1062 = insertelement <4 x float> %1061, float %669, i64 2
+ %1063 = insertelement <4 x float> %1062, float %670, i64 3
+ %1064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %932, <4 x float> %1063, i32 0, i32 0, i32 0)
+ %1065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %933, <4 x float> %1064, i32 0, i32 0, i32 0)
+ %1066 = insertelement <4 x float> poison, float %639, i64 0
+ %1067 = insertelement <4 x float> %1066, float %640, i64 1
+ %1068 = insertelement <4 x float> %1067, float %641, i64 2
+ %1069 = insertelement <4 x float> %1068, float %642, i64 3
+ %1070 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %930, <4 x float> %1069, i32 0, i32 0, i32 0)
+ %1071 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %931, <4 x float> %1070, i32 0, i32 0, i32 0)
+ %1072 = insertelement <4 x float> poison, float %643, i64 0
+ %1073 = insertelement <4 x float> %1072, float %644, i64 1
+ %1074 = insertelement <4 x float> %1073, float %645, i64 2
+ %1075 = insertelement <4 x float> %1074, float %646, i64 3
+ %1076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %930, <4 x float> %1075, i32 0, i32 0, i32 0)
+ %1077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %931, <4 x float> %1076, i32 0, i32 0, i32 0)
+ %1078 = insertelement <4 x float> poison, float %647, i64 0
+ %1079 = insertelement <4 x float> %1078, float %648, i64 1
+ %1080 = insertelement <4 x float> %1079, float %649, i64 2
+ %1081 = insertelement <4 x float> %1080, float %650, i64 3
+ %1082 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %932, <4 x float> %1081, i32 0, i32 0, i32 0)
+ %1083 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %933, <4 x float> %1082, i32 0, i32 0, i32 0)
+ %1084 = insertelement <4 x float> poison, float %651, i64 0
+ %1085 = insertelement <4 x float> %1084, float %652, i64 1
+ %1086 = insertelement <4 x float> %1085, float %653, i64 2
+ %1087 = insertelement <4 x float> %1086, float %654, i64 3
+ %1088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %932, <4 x float> %1087, i32 0, i32 0, i32 0)
+ %1089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %933, <4 x float> %1088, i32 0, i32 0, i32 0)
+ %1090 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1091 = shufflevector <8 x half> %1038, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1092 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1093 = shufflevector <8 x half> %1039, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1094 = insertelement <4 x float> poison, float %623, i64 0
+ %1095 = insertelement <4 x float> %1094, float %624, i64 1
+ %1096 = insertelement <4 x float> %1095, float %625, i64 2
+ %1097 = insertelement <4 x float> %1096, float %626, i64 3
+ %1098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1090, <4 x float> %1097, i32 0, i32 0, i32 0)
+ %1099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1091, <4 x float> %1098, i32 0, i32 0, i32 0)
+ %1100 = insertelement <4 x float> poison, float %627, i64 0
+ %1101 = insertelement <4 x float> %1100, float %628, i64 1
+ %1102 = insertelement <4 x float> %1101, float %629, i64 2
+ %1103 = insertelement <4 x float> %1102, float %630, i64 3
+ %1104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1090, <4 x float> %1103, i32 0, i32 0, i32 0)
+ %1105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1091, <4 x float> %1104, i32 0, i32 0, i32 0)
+ %1106 = insertelement <4 x float> poison, float %631, i64 0
+ %1107 = insertelement <4 x float> %1106, float %632, i64 1
+ %1108 = insertelement <4 x float> %1107, float %633, i64 2
+ %1109 = insertelement <4 x float> %1108, float %634, i64 3
+ %1110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1092, <4 x float> %1109, i32 0, i32 0, i32 0)
+ %1111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1093, <4 x float> %1110, i32 0, i32 0, i32 0)
+ %1112 = insertelement <4 x float> poison, float %635, i64 0
+ %1113 = insertelement <4 x float> %1112, float %636, i64 1
+ %1114 = insertelement <4 x float> %1113, float %637, i64 2
+ %1115 = insertelement <4 x float> %1114, float %638, i64 3
+ %1116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1092, <4 x float> %1115, i32 0, i32 0, i32 0)
+ %1117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1093, <4 x float> %1116, i32 0, i32 0, i32 0)
+ %1118 = insertelement <4 x float> poison, float %607, i64 0
+ %1119 = insertelement <4 x float> %1118, float %608, i64 1
+ %1120 = insertelement <4 x float> %1119, float %609, i64 2
+ %1121 = insertelement <4 x float> %1120, float %610, i64 3
+ %1122 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1090, <4 x float> %1121, i32 0, i32 0, i32 0)
+ %1123 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1091, <4 x float> %1122, i32 0, i32 0, i32 0)
+ %1124 = insertelement <4 x float> poison, float %611, i64 0
+ %1125 = insertelement <4 x float> %1124, float %612, i64 1
+ %1126 = insertelement <4 x float> %1125, float %613, i64 2
+ %1127 = insertelement <4 x float> %1126, float %614, i64 3
+ %1128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1090, <4 x float> %1127, i32 0, i32 0, i32 0)
+ %1129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1091, <4 x float> %1128, i32 0, i32 0, i32 0)
+ %1130 = insertelement <4 x float> poison, float %615, i64 0
+ %1131 = insertelement <4 x float> %1130, float %616, i64 1
+ %1132 = insertelement <4 x float> %1131, float %617, i64 2
+ %1133 = insertelement <4 x float> %1132, float %618, i64 3
+ %1134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1092, <4 x float> %1133, i32 0, i32 0, i32 0)
+ %1135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1093, <4 x float> %1134, i32 0, i32 0, i32 0)
+ %1136 = insertelement <4 x float> poison, float %619, i64 0
+ %1137 = insertelement <4 x float> %1136, float %620, i64 1
+ %1138 = insertelement <4 x float> %1137, float %621, i64 2
+ %1139 = insertelement <4 x float> %1138, float %622, i64 3
+ %1140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1092, <4 x float> %1139, i32 0, i32 0, i32 0)
+ %1141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1093, <4 x float> %1140, i32 0, i32 0, i32 0)
+ %1142 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1143 = shufflevector <8 x half> %1040, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1144 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1145 = shufflevector <8 x half> %1041, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1146 = insertelement <4 x float> poison, float %559, i64 0
+ %1147 = insertelement <4 x float> %1146, float %560, i64 1
+ %1148 = insertelement <4 x float> %1147, float %561, i64 2
+ %1149 = insertelement <4 x float> %1148, float %562, i64 3
+ %1150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1142, <4 x float> %1149, i32 0, i32 0, i32 0)
+ %1151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1143, <4 x float> %1150, i32 0, i32 0, i32 0)
+ %1152 = insertelement <4 x float> poison, float %563, i64 0
+ %1153 = insertelement <4 x float> %1152, float %564, i64 1
+ %1154 = insertelement <4 x float> %1153, float %565, i64 2
+ %1155 = insertelement <4 x float> %1154, float %566, i64 3
+ %1156 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1142, <4 x float> %1155, i32 0, i32 0, i32 0)
+ %1157 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1143, <4 x float> %1156, i32 0, i32 0, i32 0)
+ %1158 = insertelement <4 x float> poison, float %567, i64 0
+ %1159 = insertelement <4 x float> %1158, float %568, i64 1
+ %1160 = insertelement <4 x float> %1159, float %569, i64 2
+ %1161 = insertelement <4 x float> %1160, float %570, i64 3
+ %1162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %870, <4 x half> %1144, <4 x float> %1161, i32 0, i32 0, i32 0)
+ %1163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %871, <4 x half> %1145, <4 x float> %1162, i32 0, i32 0, i32 0)
+ %1164 = insertelement <4 x float> poison, float %571, i64 0
+ %1165 = insertelement <4 x float> %1164, float %572, i64 1
+ %1166 = insertelement <4 x float> %1165, float %573, i64 2
+ %1167 = insertelement <4 x float> %1166, float %574, i64 3
+ %1168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %872, <4 x half> %1144, <4 x float> %1167, i32 0, i32 0, i32 0)
+ %1169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %873, <4 x half> %1145, <4 x float> %1168, i32 0, i32 0, i32 0)
+ %1170 = insertelement <4 x float> poison, float %543, i64 0
+ %1171 = insertelement <4 x float> %1170, float %544, i64 1
+ %1172 = insertelement <4 x float> %1171, float %545, i64 2
+ %1173 = insertelement <4 x float> %1172, float %546, i64 3
+ %1174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1142, <4 x float> %1173, i32 0, i32 0, i32 0)
+ %1175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1143, <4 x float> %1174, i32 0, i32 0, i32 0)
+ %1176 = insertelement <4 x float> poison, float %547, i64 0
+ %1177 = insertelement <4 x float> %1176, float %548, i64 1
+ %1178 = insertelement <4 x float> %1177, float %549, i64 2
+ %1179 = insertelement <4 x float> %1178, float %550, i64 3
+ %1180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1142, <4 x float> %1179, i32 0, i32 0, i32 0)
+ %1181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1143, <4 x float> %1180, i32 0, i32 0, i32 0)
+ %1182 = insertelement <4 x float> poison, float %551, i64 0
+ %1183 = insertelement <4 x float> %1182, float %552, i64 1
+ %1184 = insertelement <4 x float> %1183, float %553, i64 2
+ %1185 = insertelement <4 x float> %1184, float %554, i64 3
+ %1186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %898, <4 x half> %1144, <4 x float> %1185, i32 0, i32 0, i32 0)
+ %1187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %899, <4 x half> %1145, <4 x float> %1186, i32 0, i32 0, i32 0)
+ %1188 = insertelement <4 x float> poison, float %555, i64 0
+ %1189 = insertelement <4 x float> %1188, float %556, i64 1
+ %1190 = insertelement <4 x float> %1189, float %557, i64 2
+ %1191 = insertelement <4 x float> %1190, float %558, i64 3
+ %1192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %900, <4 x half> %1144, <4 x float> %1191, i32 0, i32 0, i32 0)
+ %1193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %901, <4 x half> %1145, <4 x float> %1192, i32 0, i32 0, i32 0)
+ %1194 = insertelement <4 x float> poison, float %591, i64 0
+ %1195 = insertelement <4 x float> %1194, float %592, i64 1
+ %1196 = insertelement <4 x float> %1195, float %593, i64 2
+ %1197 = insertelement <4 x float> %1196, float %594, i64 3
+ %1198 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1090, <4 x float> %1197, i32 0, i32 0, i32 0)
+ %1199 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1091, <4 x float> %1198, i32 0, i32 0, i32 0)
+ %1200 = insertelement <4 x float> poison, float %595, i64 0
+ %1201 = insertelement <4 x float> %1200, float %596, i64 1
+ %1202 = insertelement <4 x float> %1201, float %597, i64 2
+ %1203 = insertelement <4 x float> %1202, float %598, i64 3
+ %1204 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1090, <4 x float> %1203, i32 0, i32 0, i32 0)
+ %1205 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1091, <4 x float> %1204, i32 0, i32 0, i32 0)
+ %1206 = insertelement <4 x float> poison, float %599, i64 0
+ %1207 = insertelement <4 x float> %1206, float %600, i64 1
+ %1208 = insertelement <4 x float> %1207, float %601, i64 2
+ %1209 = insertelement <4 x float> %1208, float %602, i64 3
+ %1210 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1092, <4 x float> %1209, i32 0, i32 0, i32 0)
+ %1211 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1093, <4 x float> %1210, i32 0, i32 0, i32 0)
+ %1212 = insertelement <4 x float> poison, float %603, i64 0
+ %1213 = insertelement <4 x float> %1212, float %604, i64 1
+ %1214 = insertelement <4 x float> %1213, float %605, i64 2
+ %1215 = insertelement <4 x float> %1214, float %606, i64 3
+ %1216 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1092, <4 x float> %1215, i32 0, i32 0, i32 0)
+ %1217 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1093, <4 x float> %1216, i32 0, i32 0, i32 0)
+ %1218 = load <8 x half>, ptr addrspace(3) %385, align 16
+ %1219 = load <8 x half>, ptr addrspace(3) %386, align 16
+ %1220 = load <8 x half>, ptr addrspace(3) %388, align 16
+ %1221 = load <8 x half>, ptr addrspace(3) %389, align 16
+ %1222 = insertelement <4 x float> poison, float %575, i64 0
+ %1223 = insertelement <4 x float> %1222, float %576, i64 1
+ %1224 = insertelement <4 x float> %1223, float %577, i64 2
+ %1225 = insertelement <4 x float> %1224, float %578, i64 3
+ %1226 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1090, <4 x float> %1225, i32 0, i32 0, i32 0)
+ %1227 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1091, <4 x float> %1226, i32 0, i32 0, i32 0)
+ %1228 = insertelement <4 x float> poison, float %579, i64 0
+ %1229 = insertelement <4 x float> %1228, float %580, i64 1
+ %1230 = insertelement <4 x float> %1229, float %581, i64 2
+ %1231 = insertelement <4 x float> %1230, float %582, i64 3
+ %1232 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1090, <4 x float> %1231, i32 0, i32 0, i32 0)
+ %1233 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1091, <4 x float> %1232, i32 0, i32 0, i32 0)
+ %1234 = insertelement <4 x float> poison, float %583, i64 0
+ %1235 = insertelement <4 x float> %1234, float %584, i64 1
+ %1236 = insertelement <4 x float> %1235, float %585, i64 2
+ %1237 = insertelement <4 x float> %1236, float %586, i64 3
+ %1238 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1092, <4 x float> %1237, i32 0, i32 0, i32 0)
+ %1239 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1093, <4 x float> %1238, i32 0, i32 0, i32 0)
+ %1240 = insertelement <4 x float> poison, float %587, i64 0
+ %1241 = insertelement <4 x float> %1240, float %588, i64 1
+ %1242 = insertelement <4 x float> %1241, float %589, i64 2
+ %1243 = insertelement <4 x float> %1242, float %590, i64 3
+ %1244 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1092, <4 x float> %1243, i32 0, i32 0, i32 0)
+ %1245 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1093, <4 x float> %1244, i32 0, i32 0, i32 0)
+ %1246 = load <8 x half>, ptr addrspace(3) %391, align 16
+ %1247 = load <8 x half>, ptr addrspace(3) %392, align 16
+ %1248 = load <8 x half>, ptr addrspace(3) %395, align 16
+ %1249 = load <8 x half>, ptr addrspace(3) %396, align 16
+ %1250 = insertelement <4 x float> poison, float %527, i64 0
+ %1251 = insertelement <4 x float> %1250, float %528, i64 1
+ %1252 = insertelement <4 x float> %1251, float %529, i64 2
+ %1253 = insertelement <4 x float> %1252, float %530, i64 3
+ %1254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1142, <4 x float> %1253, i32 0, i32 0, i32 0)
+ %1255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1143, <4 x float> %1254, i32 0, i32 0, i32 0)
+ %1256 = insertelement <4 x float> poison, float %531, i64 0
+ %1257 = insertelement <4 x float> %1256, float %532, i64 1
+ %1258 = insertelement <4 x float> %1257, float %533, i64 2
+ %1259 = insertelement <4 x float> %1258, float %534, i64 3
+ %1260 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1142, <4 x float> %1259, i32 0, i32 0, i32 0)
+ %1261 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1143, <4 x float> %1260, i32 0, i32 0, i32 0)
+ %1262 = insertelement <4 x float> poison, float %535, i64 0
+ %1263 = insertelement <4 x float> %1262, float %536, i64 1
+ %1264 = insertelement <4 x float> %1263, float %537, i64 2
+ %1265 = insertelement <4 x float> %1264, float %538, i64 3
+ %1266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %982, <4 x half> %1144, <4 x float> %1265, i32 0, i32 0, i32 0)
+ %1267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %983, <4 x half> %1145, <4 x float> %1266, i32 0, i32 0, i32 0)
+ %1268 = insertelement <4 x float> poison, float %539, i64 0
+ %1269 = insertelement <4 x float> %1268, float %540, i64 1
+ %1270 = insertelement <4 x float> %1269, float %541, i64 2
+ %1271 = insertelement <4 x float> %1270, float %542, i64 3
+ %1272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %984, <4 x half> %1144, <4 x float> %1271, i32 0, i32 0, i32 0)
+ %1273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %985, <4 x half> %1145, <4 x float> %1272, i32 0, i32 0, i32 0)
+ %1274 = insertelement <4 x float> poison, float %511, i64 0
+ %1275 = insertelement <4 x float> %1274, float %512, i64 1
+ %1276 = insertelement <4 x float> %1275, float %513, i64 2
+ %1277 = insertelement <4 x float> %1276, float %514, i64 3
+ %1278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1142, <4 x float> %1277, i32 0, i32 0, i32 0)
+ %1279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1143, <4 x float> %1278, i32 0, i32 0, i32 0)
+ %1280 = insertelement <4 x float> poison, float %515, i64 0
+ %1281 = insertelement <4 x float> %1280, float %516, i64 1
+ %1282 = insertelement <4 x float> %1281, float %517, i64 2
+ %1283 = insertelement <4 x float> %1282, float %518, i64 3
+ %1284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1142, <4 x float> %1283, i32 0, i32 0, i32 0)
+ %1285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1143, <4 x float> %1284, i32 0, i32 0, i32 0)
+ %1286 = insertelement <4 x float> poison, float %519, i64 0
+ %1287 = insertelement <4 x float> %1286, float %520, i64 1
+ %1288 = insertelement <4 x float> %1287, float %521, i64 2
+ %1289 = insertelement <4 x float> %1288, float %522, i64 3
+ %1290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1010, <4 x half> %1144, <4 x float> %1289, i32 0, i32 0, i32 0)
+ %1291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1011, <4 x half> %1145, <4 x float> %1290, i32 0, i32 0, i32 0)
+ %1292 = insertelement <4 x float> poison, float %523, i64 0
+ %1293 = insertelement <4 x float> %1292, float %524, i64 1
+ %1294 = insertelement <4 x float> %1293, float %525, i64 2
+ %1295 = insertelement <4 x float> %1294, float %526, i64 3
+ %1296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1012, <4 x half> %1144, <4 x float> %1295, i32 0, i32 0, i32 0)
+ %1297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1013, <4 x half> %1145, <4 x float> %1296, i32 0, i32 0, i32 0)
+ %1298 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1299 = shufflevector <8 x half> %1218, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1300 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1301 = shufflevector <8 x half> %1219, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1302 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1303 = shufflevector <8 x half> %1220, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1304 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1305 = shufflevector <8 x half> %1221, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1306 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1298, <4 x float> %879, i32 0, i32 0, i32 0)
+ %1307 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1299, <4 x float> %1306, i32 0, i32 0, i32 0)
+ %1308 = extractelement <4 x float> %1307, i64 0
+ %1309 = extractelement <4 x float> %1307, i64 1
+ %1310 = extractelement <4 x float> %1307, i64 2
+ %1311 = extractelement <4 x float> %1307, i64 3
+ %1312 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1298, <4 x float> %885, i32 0, i32 0, i32 0)
+ %1313 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1299, <4 x float> %1312, i32 0, i32 0, i32 0)
+ %1314 = extractelement <4 x float> %1313, i64 0
+ %1315 = extractelement <4 x float> %1313, i64 1
+ %1316 = extractelement <4 x float> %1313, i64 2
+ %1317 = extractelement <4 x float> %1313, i64 3
+ %1318 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1300, <4 x float> %891, i32 0, i32 0, i32 0)
+ %1319 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1301, <4 x float> %1318, i32 0, i32 0, i32 0)
+ %1320 = extractelement <4 x float> %1319, i64 0
+ %1321 = extractelement <4 x float> %1319, i64 1
+ %1322 = extractelement <4 x float> %1319, i64 2
+ %1323 = extractelement <4 x float> %1319, i64 3
+ %1324 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1300, <4 x float> %897, i32 0, i32 0, i32 0)
+ %1325 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1301, <4 x float> %1324, i32 0, i32 0, i32 0)
+ %1326 = extractelement <4 x float> %1325, i64 0
+ %1327 = extractelement <4 x float> %1325, i64 1
+ %1328 = extractelement <4 x float> %1325, i64 2
+ %1329 = extractelement <4 x float> %1325, i64 3
+ %1330 = load <8 x half>, ptr addrspace(3) %399, align 16
+ %1331 = load <8 x half>, ptr addrspace(3) %400, align 16
+ %1332 = load <8 x half>, ptr addrspace(3) %403, align 16
+ %1333 = load <8 x half>, ptr addrspace(3) %404, align 16
+ %1334 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1335 = shufflevector <8 x half> %1246, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1336 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1337 = shufflevector <8 x half> %1247, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1298, <4 x float> %907, i32 0, i32 0, i32 0)
+ %1339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1299, <4 x float> %1338, i32 0, i32 0, i32 0)
+ %1340 = extractelement <4 x float> %1339, i64 0
+ %1341 = extractelement <4 x float> %1339, i64 1
+ %1342 = extractelement <4 x float> %1339, i64 2
+ %1343 = extractelement <4 x float> %1339, i64 3
+ %1344 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1298, <4 x float> %913, i32 0, i32 0, i32 0)
+ %1345 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1299, <4 x float> %1344, i32 0, i32 0, i32 0)
+ %1346 = extractelement <4 x float> %1345, i64 0
+ %1347 = extractelement <4 x float> %1345, i64 1
+ %1348 = extractelement <4 x float> %1345, i64 2
+ %1349 = extractelement <4 x float> %1345, i64 3
+ %1350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1300, <4 x float> %919, i32 0, i32 0, i32 0)
+ %1351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1301, <4 x float> %1350, i32 0, i32 0, i32 0)
+ %1352 = extractelement <4 x float> %1351, i64 0
+ %1353 = extractelement <4 x float> %1351, i64 1
+ %1354 = extractelement <4 x float> %1351, i64 2
+ %1355 = extractelement <4 x float> %1351, i64 3
+ %1356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1300, <4 x float> %925, i32 0, i32 0, i32 0)
+ %1357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1301, <4 x float> %1356, i32 0, i32 0, i32 0)
+ %1358 = extractelement <4 x float> %1357, i64 0
+ %1359 = extractelement <4 x float> %1357, i64 1
+ %1360 = extractelement <4 x float> %1357, i64 2
+ %1361 = extractelement <4 x float> %1357, i64 3
+ %1362 = load <8 x half>, ptr addrspace(3) %407, align 16
+ %1363 = load <8 x half>, ptr addrspace(3) %408, align 16
+ %1364 = load <8 x half>, ptr addrspace(3) %411, align 16
+ %1365 = load <8 x half>, ptr addrspace(3) %412, align 16
+ %1366 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1367 = shufflevector <8 x half> %1248, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1368 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1369 = shufflevector <8 x half> %1249, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1370 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1366, <4 x float> %939, i32 0, i32 0, i32 0)
+ %1371 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1367, <4 x float> %1370, i32 0, i32 0, i32 0)
+ %1372 = extractelement <4 x float> %1371, i64 0
+ %1373 = extractelement <4 x float> %1371, i64 1
+ %1374 = extractelement <4 x float> %1371, i64 2
+ %1375 = extractelement <4 x float> %1371, i64 3
+ %1376 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1366, <4 x float> %945, i32 0, i32 0, i32 0)
+ %1377 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1367, <4 x float> %1376, i32 0, i32 0, i32 0)
+ %1378 = extractelement <4 x float> %1377, i64 0
+ %1379 = extractelement <4 x float> %1377, i64 1
+ %1380 = extractelement <4 x float> %1377, i64 2
+ %1381 = extractelement <4 x float> %1377, i64 3
+ %1382 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1368, <4 x float> %951, i32 0, i32 0, i32 0)
+ %1383 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1369, <4 x float> %1382, i32 0, i32 0, i32 0)
+ %1384 = extractelement <4 x float> %1383, i64 0
+ %1385 = extractelement <4 x float> %1383, i64 1
+ %1386 = extractelement <4 x float> %1383, i64 2
+ %1387 = extractelement <4 x float> %1383, i64 3
+ %1388 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1368, <4 x float> %957, i32 0, i32 0, i32 0)
+ %1389 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1369, <4 x float> %1388, i32 0, i32 0, i32 0)
+ %1390 = extractelement <4 x float> %1389, i64 0
+ %1391 = extractelement <4 x float> %1389, i64 1
+ %1392 = extractelement <4 x float> %1389, i64 2
+ %1393 = extractelement <4 x float> %1389, i64 3
+ %1394 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1366, <4 x float> %963, i32 0, i32 0, i32 0)
+ %1395 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1367, <4 x float> %1394, i32 0, i32 0, i32 0)
+ %1396 = extractelement <4 x float> %1395, i64 0
+ %1397 = extractelement <4 x float> %1395, i64 1
+ %1398 = extractelement <4 x float> %1395, i64 2
+ %1399 = extractelement <4 x float> %1395, i64 3
+ %1400 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1366, <4 x float> %969, i32 0, i32 0, i32 0)
+ %1401 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1367, <4 x float> %1400, i32 0, i32 0, i32 0)
+ %1402 = extractelement <4 x float> %1401, i64 0
+ %1403 = extractelement <4 x float> %1401, i64 1
+ %1404 = extractelement <4 x float> %1401, i64 2
+ %1405 = extractelement <4 x float> %1401, i64 3
+ %1406 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1368, <4 x float> %975, i32 0, i32 0, i32 0)
+ %1407 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1369, <4 x float> %1406, i32 0, i32 0, i32 0)
+ %1408 = extractelement <4 x float> %1407, i64 0
+ %1409 = extractelement <4 x float> %1407, i64 1
+ %1410 = extractelement <4 x float> %1407, i64 2
+ %1411 = extractelement <4 x float> %1407, i64 3
+ %1412 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1368, <4 x float> %981, i32 0, i32 0, i32 0)
+ %1413 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1369, <4 x float> %1412, i32 0, i32 0, i32 0)
+ %1414 = extractelement <4 x float> %1413, i64 0
+ %1415 = extractelement <4 x float> %1413, i64 1
+ %1416 = extractelement <4 x float> %1413, i64 2
+ %1417 = extractelement <4 x float> %1413, i64 3
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %1418 = shufflevector <2 x half> %865, <2 x half> %864, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1419 = shufflevector <2 x half> %863, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1420 = shufflevector <8 x half> %1418, <8 x half> %1419, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1421 = shufflevector <2 x half> %862, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1422 = shufflevector <8 x half> %1420, <8 x half> %1421, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1422, ptr addrspace(3) %199, align 16
+ %1423 = shufflevector <2 x half> %861, <2 x half> %860, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1424 = shufflevector <2 x half> %859, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1425 = shufflevector <8 x half> %1423, <8 x half> %1424, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1426 = shufflevector <2 x half> %858, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1427 = shufflevector <8 x half> %1425, <8 x half> %1426, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1427, ptr addrspace(3) %201, align 16
+ %1428 = shufflevector <2 x half> %857, <2 x half> %856, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1429 = shufflevector <2 x half> %855, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1430 = shufflevector <8 x half> %1428, <8 x half> %1429, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1431 = shufflevector <2 x half> %854, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1432 = shufflevector <8 x half> %1430, <8 x half> %1431, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1432, ptr addrspace(3) %203, align 16
+ %1433 = shufflevector <2 x half> %853, <2 x half> %852, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1434 = shufflevector <2 x half> %851, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1435 = shufflevector <8 x half> %1433, <8 x half> %1434, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1436 = shufflevector <2 x half> %850, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1437 = shufflevector <8 x half> %1435, <8 x half> %1436, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1437, ptr addrspace(3) %205, align 16
+ %1438 = shufflevector <2 x half> %849, <2 x half> %848, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1439 = shufflevector <2 x half> %847, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1440 = shufflevector <8 x half> %1438, <8 x half> %1439, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1441 = shufflevector <2 x half> %846, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1442 = shufflevector <8 x half> %1440, <8 x half> %1441, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1442, ptr addrspace(3) %207, align 16
+ %1443 = shufflevector <2 x half> %845, <2 x half> %844, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1444 = shufflevector <2 x half> %843, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1445 = shufflevector <8 x half> %1443, <8 x half> %1444, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1446 = shufflevector <2 x half> %842, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1447 = shufflevector <8 x half> %1445, <8 x half> %1446, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1447, ptr addrspace(3) %209, align 16
+ %1448 = shufflevector <2 x half> %841, <2 x half> %840, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1449 = shufflevector <2 x half> %839, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1450 = shufflevector <8 x half> %1448, <8 x half> %1449, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1451 = shufflevector <2 x half> %838, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1452 = shufflevector <8 x half> %1450, <8 x half> %1451, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1452, ptr addrspace(3) %211, align 16
+ %1453 = shufflevector <2 x half> %837, <2 x half> %836, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1454 = shufflevector <2 x half> %835, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1455 = shufflevector <8 x half> %1453, <8 x half> %1454, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1456 = shufflevector <2 x half> %834, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1457 = shufflevector <8 x half> %1455, <8 x half> %1456, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1457, ptr addrspace(3) %213, align 16
+ %1458 = getelementptr i8, ptr addrspace(1) %752, i64 128
+ %1459 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1458, i16 0, i32 2147483646, i32 159744)
+ %1460 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %68, i32 0, i32 0)
+ %1461 = bitcast <4 x i32> %1460 to <8 x half>
+ %1462 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %71, i32 0, i32 0)
+ %1463 = bitcast <4 x i32> %1462 to <8 x half>
+ %1464 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %74, i32 0, i32 0)
+ %1465 = bitcast <4 x i32> %1464 to <8 x half>
+ %1466 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %77, i32 0, i32 0)
+ %1467 = bitcast <4 x i32> %1466 to <8 x half>
+ %1468 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %80, i32 0, i32 0)
+ %1469 = bitcast <4 x i32> %1468 to <8 x half>
+ %1470 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %83, i32 0, i32 0)
+ %1471 = bitcast <4 x i32> %1470 to <8 x half>
+ %1472 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %86, i32 0, i32 0)
+ %1473 = bitcast <4 x i32> %1472 to <8 x half>
+ %1474 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1459, i32 %89, i32 0, i32 0)
+ %1475 = bitcast <4 x i32> %1474 to <8 x half>
+ %1476 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1477 = shufflevector <8 x half> %1330, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1478 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1479 = shufflevector <8 x half> %1331, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1298, <4 x float> %991, i32 0, i32 0, i32 0)
+ %1481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1299, <4 x float> %1480, i32 0, i32 0, i32 0)
+ %1482 = extractelement <4 x float> %1481, i64 0
+ %1483 = extractelement <4 x float> %1481, i64 1
+ %1484 = extractelement <4 x float> %1481, i64 2
+ %1485 = extractelement <4 x float> %1481, i64 3
+ %1486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1298, <4 x float> %997, i32 0, i32 0, i32 0)
+ %1487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1299, <4 x float> %1486, i32 0, i32 0, i32 0)
+ %1488 = extractelement <4 x float> %1487, i64 0
+ %1489 = extractelement <4 x float> %1487, i64 1
+ %1490 = extractelement <4 x float> %1487, i64 2
+ %1491 = extractelement <4 x float> %1487, i64 3
+ %1492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1300, <4 x float> %1003, i32 0, i32 0, i32 0)
+ %1493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1301, <4 x float> %1492, i32 0, i32 0, i32 0)
+ %1494 = extractelement <4 x float> %1493, i64 0
+ %1495 = extractelement <4 x float> %1493, i64 1
+ %1496 = extractelement <4 x float> %1493, i64 2
+ %1497 = extractelement <4 x float> %1493, i64 3
+ %1498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1300, <4 x float> %1009, i32 0, i32 0, i32 0)
+ %1499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1301, <4 x float> %1498, i32 0, i32 0, i32 0)
+ %1500 = extractelement <4 x float> %1499, i64 0
+ %1501 = extractelement <4 x float> %1499, i64 1
+ %1502 = extractelement <4 x float> %1499, i64 2
+ %1503 = extractelement <4 x float> %1499, i64 3
+ %1504 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1505 = shufflevector <8 x half> %1332, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1506 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1507 = shufflevector <8 x half> %1333, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1508 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1298, <4 x float> %1019, i32 0, i32 0, i32 0)
+ %1509 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1299, <4 x float> %1508, i32 0, i32 0, i32 0)
+ %1510 = extractelement <4 x float> %1509, i64 0
+ %1511 = extractelement <4 x float> %1509, i64 1
+ %1512 = extractelement <4 x float> %1509, i64 2
+ %1513 = extractelement <4 x float> %1509, i64 3
+ %1514 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1298, <4 x float> %1025, i32 0, i32 0, i32 0)
+ %1515 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1299, <4 x float> %1514, i32 0, i32 0, i32 0)
+ %1516 = extractelement <4 x float> %1515, i64 0
+ %1517 = extractelement <4 x float> %1515, i64 1
+ %1518 = extractelement <4 x float> %1515, i64 2
+ %1519 = extractelement <4 x float> %1515, i64 3
+ %1520 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1300, <4 x float> %1031, i32 0, i32 0, i32 0)
+ %1521 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1301, <4 x float> %1520, i32 0, i32 0, i32 0)
+ %1522 = extractelement <4 x float> %1521, i64 0
+ %1523 = extractelement <4 x float> %1521, i64 1
+ %1524 = extractelement <4 x float> %1521, i64 2
+ %1525 = extractelement <4 x float> %1521, i64 3
+ %1526 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1300, <4 x float> %1037, i32 0, i32 0, i32 0)
+ %1527 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1301, <4 x float> %1526, i32 0, i32 0, i32 0)
+ %1528 = extractelement <4 x float> %1527, i64 0
+ %1529 = extractelement <4 x float> %1527, i64 1
+ %1530 = extractelement <4 x float> %1527, i64 2
+ %1531 = extractelement <4 x float> %1527, i64 3
+ %1532 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1366, <4 x float> %1047, i32 0, i32 0, i32 0)
+ %1533 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1367, <4 x float> %1532, i32 0, i32 0, i32 0)
+ %1534 = extractelement <4 x float> %1533, i64 0
+ %1535 = extractelement <4 x float> %1533, i64 1
+ %1536 = extractelement <4 x float> %1533, i64 2
+ %1537 = extractelement <4 x float> %1533, i64 3
+ %1538 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1366, <4 x float> %1053, i32 0, i32 0, i32 0)
+ %1539 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1367, <4 x float> %1538, i32 0, i32 0, i32 0)
+ %1540 = extractelement <4 x float> %1539, i64 0
+ %1541 = extractelement <4 x float> %1539, i64 1
+ %1542 = extractelement <4 x float> %1539, i64 2
+ %1543 = extractelement <4 x float> %1539, i64 3
+ %1544 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1368, <4 x float> %1059, i32 0, i32 0, i32 0)
+ %1545 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1369, <4 x float> %1544, i32 0, i32 0, i32 0)
+ %1546 = extractelement <4 x float> %1545, i64 0
+ %1547 = extractelement <4 x float> %1545, i64 1
+ %1548 = extractelement <4 x float> %1545, i64 2
+ %1549 = extractelement <4 x float> %1545, i64 3
+ %1550 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1368, <4 x float> %1065, i32 0, i32 0, i32 0)
+ %1551 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1369, <4 x float> %1550, i32 0, i32 0, i32 0)
+ %1552 = extractelement <4 x float> %1551, i64 0
+ %1553 = extractelement <4 x float> %1551, i64 1
+ %1554 = extractelement <4 x float> %1551, i64 2
+ %1555 = extractelement <4 x float> %1551, i64 3
+ %1556 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1366, <4 x float> %1071, i32 0, i32 0, i32 0)
+ %1557 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1367, <4 x float> %1556, i32 0, i32 0, i32 0)
+ %1558 = extractelement <4 x float> %1557, i64 0
+ %1559 = extractelement <4 x float> %1557, i64 1
+ %1560 = extractelement <4 x float> %1557, i64 2
+ %1561 = extractelement <4 x float> %1557, i64 3
+ %1562 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1366, <4 x float> %1077, i32 0, i32 0, i32 0)
+ %1563 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1367, <4 x float> %1562, i32 0, i32 0, i32 0)
+ %1564 = extractelement <4 x float> %1563, i64 0
+ %1565 = extractelement <4 x float> %1563, i64 1
+ %1566 = extractelement <4 x float> %1563, i64 2
+ %1567 = extractelement <4 x float> %1563, i64 3
+ %1568 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1368, <4 x float> %1083, i32 0, i32 0, i32 0)
+ %1569 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1369, <4 x float> %1568, i32 0, i32 0, i32 0)
+ %1570 = extractelement <4 x float> %1569, i64 0
+ %1571 = extractelement <4 x float> %1569, i64 1
+ %1572 = extractelement <4 x float> %1569, i64 2
+ %1573 = extractelement <4 x float> %1569, i64 3
+ %1574 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1368, <4 x float> %1089, i32 0, i32 0, i32 0)
+ %1575 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1369, <4 x float> %1574, i32 0, i32 0, i32 0)
+ %1576 = extractelement <4 x float> %1575, i64 0
+ %1577 = extractelement <4 x float> %1575, i64 1
+ %1578 = extractelement <4 x float> %1575, i64 2
+ %1579 = extractelement <4 x float> %1575, i64 3
+ %1580 = shufflevector <2 x half> %833, <2 x half> %832, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1581 = shufflevector <2 x half> %831, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1582 = shufflevector <8 x half> %1580, <8 x half> %1581, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1583 = shufflevector <2 x half> %830, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1584 = shufflevector <8 x half> %1582, <8 x half> %1583, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1584, ptr addrspace(3) %214, align 16
+ %1585 = shufflevector <2 x half> %829, <2 x half> %828, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1586 = shufflevector <2 x half> %827, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1587 = shufflevector <8 x half> %1585, <8 x half> %1586, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1588 = shufflevector <2 x half> %826, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1589 = shufflevector <8 x half> %1587, <8 x half> %1588, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1589, ptr addrspace(3) %215, align 16
+ %1590 = shufflevector <2 x half> %825, <2 x half> %824, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1591 = shufflevector <2 x half> %823, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1592 = shufflevector <8 x half> %1590, <8 x half> %1591, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1593 = shufflevector <2 x half> %822, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1594 = shufflevector <8 x half> %1592, <8 x half> %1593, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1594, ptr addrspace(3) %216, align 16
+ %1595 = shufflevector <2 x half> %821, <2 x half> %820, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1596 = shufflevector <2 x half> %819, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1597 = shufflevector <8 x half> %1595, <8 x half> %1596, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1598 = shufflevector <2 x half> %818, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1599 = shufflevector <8 x half> %1597, <8 x half> %1598, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1599, ptr addrspace(3) %217, align 16
+ %1600 = shufflevector <2 x half> %817, <2 x half> %816, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1601 = shufflevector <2 x half> %815, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1602 = shufflevector <8 x half> %1600, <8 x half> %1601, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1603 = shufflevector <2 x half> %814, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1604 = shufflevector <8 x half> %1602, <8 x half> %1603, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1604, ptr addrspace(3) %218, align 16
+ %1605 = shufflevector <2 x half> %813, <2 x half> %812, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1606 = shufflevector <2 x half> %811, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1607 = shufflevector <8 x half> %1605, <8 x half> %1606, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1608 = shufflevector <2 x half> %810, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1609 = shufflevector <8 x half> %1607, <8 x half> %1608, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1609, ptr addrspace(3) %219, align 16
+ %1610 = shufflevector <2 x half> %809, <2 x half> %808, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1611 = shufflevector <2 x half> %807, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1612 = shufflevector <8 x half> %1610, <8 x half> %1611, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1613 = shufflevector <2 x half> %806, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1614 = shufflevector <8 x half> %1612, <8 x half> %1613, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1614, ptr addrspace(3) %220, align 16
+ %1615 = shufflevector <2 x half> %805, <2 x half> %804, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1616 = shufflevector <2 x half> %803, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1617 = shufflevector <8 x half> %1615, <8 x half> %1616, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %1618 = shufflevector <2 x half> %802, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %1619 = shufflevector <8 x half> %1617, <8 x half> %1618, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %1619, ptr addrspace(3) %221, align 16
+ %1620 = getelementptr i8, ptr addrspace(1) %751, i64 128
+ %1621 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %1620, i16 0, i32 2147483646, i32 159744)
+ %1622 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %140, i32 0, i32 0)
+ %1623 = bitcast <4 x i32> %1622 to <8 x half>
+ %1624 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %143, i32 0, i32 0)
+ %1625 = bitcast <4 x i32> %1624 to <8 x half>
+ %1626 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %146, i32 0, i32 0)
+ %1627 = bitcast <4 x i32> %1626 to <8 x half>
+ %1628 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %149, i32 0, i32 0)
+ %1629 = bitcast <4 x i32> %1628 to <8 x half>
+ %1630 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %152, i32 0, i32 0)
+ %1631 = bitcast <4 x i32> %1630 to <8 x half>
+ %1632 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %155, i32 0, i32 0)
+ %1633 = bitcast <4 x i32> %1632 to <8 x half>
+ %1634 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %158, i32 0, i32 0)
+ %1635 = bitcast <4 x i32> %1634 to <8 x half>
+ %1636 = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %1621, i32 %161, i32 0, i32 0)
+ %1637 = bitcast <4 x i32> %1636 to <8 x half>
+ %1638 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1639 = shufflevector <8 x half> %1362, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1640 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1641 = shufflevector <8 x half> %1363, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1642 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1638, <4 x float> %1099, i32 0, i32 0, i32 0)
+ %1643 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1639, <4 x float> %1642, i32 0, i32 0, i32 0)
+ %1644 = extractelement <4 x float> %1643, i64 0
+ %1645 = extractelement <4 x float> %1643, i64 1
+ %1646 = extractelement <4 x float> %1643, i64 2
+ %1647 = extractelement <4 x float> %1643, i64 3
+ %1648 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1638, <4 x float> %1105, i32 0, i32 0, i32 0)
+ %1649 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1639, <4 x float> %1648, i32 0, i32 0, i32 0)
+ %1650 = extractelement <4 x float> %1649, i64 0
+ %1651 = extractelement <4 x float> %1649, i64 1
+ %1652 = extractelement <4 x float> %1649, i64 2
+ %1653 = extractelement <4 x float> %1649, i64 3
+ %1654 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1640, <4 x float> %1111, i32 0, i32 0, i32 0)
+ %1655 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1641, <4 x float> %1654, i32 0, i32 0, i32 0)
+ %1656 = extractelement <4 x float> %1655, i64 0
+ %1657 = extractelement <4 x float> %1655, i64 1
+ %1658 = extractelement <4 x float> %1655, i64 2
+ %1659 = extractelement <4 x float> %1655, i64 3
+ %1660 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1640, <4 x float> %1117, i32 0, i32 0, i32 0)
+ %1661 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1641, <4 x float> %1660, i32 0, i32 0, i32 0)
+ %1662 = extractelement <4 x float> %1661, i64 0
+ %1663 = extractelement <4 x float> %1661, i64 1
+ %1664 = extractelement <4 x float> %1661, i64 2
+ %1665 = extractelement <4 x float> %1661, i64 3
+ %1666 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1638, <4 x float> %1123, i32 0, i32 0, i32 0)
+ %1667 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1639, <4 x float> %1666, i32 0, i32 0, i32 0)
+ %1668 = extractelement <4 x float> %1667, i64 0
+ %1669 = extractelement <4 x float> %1667, i64 1
+ %1670 = extractelement <4 x float> %1667, i64 2
+ %1671 = extractelement <4 x float> %1667, i64 3
+ %1672 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1638, <4 x float> %1129, i32 0, i32 0, i32 0)
+ %1673 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1639, <4 x float> %1672, i32 0, i32 0, i32 0)
+ %1674 = extractelement <4 x float> %1673, i64 0
+ %1675 = extractelement <4 x float> %1673, i64 1
+ %1676 = extractelement <4 x float> %1673, i64 2
+ %1677 = extractelement <4 x float> %1673, i64 3
+ %1678 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1640, <4 x float> %1135, i32 0, i32 0, i32 0)
+ %1679 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1641, <4 x float> %1678, i32 0, i32 0, i32 0)
+ %1680 = extractelement <4 x float> %1679, i64 0
+ %1681 = extractelement <4 x float> %1679, i64 1
+ %1682 = extractelement <4 x float> %1679, i64 2
+ %1683 = extractelement <4 x float> %1679, i64 3
+ %1684 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1640, <4 x float> %1141, i32 0, i32 0, i32 0)
+ %1685 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1641, <4 x float> %1684, i32 0, i32 0, i32 0)
+ %1686 = extractelement <4 x float> %1685, i64 0
+ %1687 = extractelement <4 x float> %1685, i64 1
+ %1688 = extractelement <4 x float> %1685, i64 2
+ %1689 = extractelement <4 x float> %1685, i64 3
+ %1690 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1691 = shufflevector <8 x half> %1364, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1692 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1693 = shufflevector <8 x half> %1365, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1690, <4 x float> %1151, i32 0, i32 0, i32 0)
+ %1695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1691, <4 x float> %1694, i32 0, i32 0, i32 0)
+ %1696 = extractelement <4 x float> %1695, i64 0
+ %1697 = extractelement <4 x float> %1695, i64 1
+ %1698 = extractelement <4 x float> %1695, i64 2
+ %1699 = extractelement <4 x float> %1695, i64 3
+ %1700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1690, <4 x float> %1157, i32 0, i32 0, i32 0)
+ %1701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1691, <4 x float> %1700, i32 0, i32 0, i32 0)
+ %1702 = extractelement <4 x float> %1701, i64 0
+ %1703 = extractelement <4 x float> %1701, i64 1
+ %1704 = extractelement <4 x float> %1701, i64 2
+ %1705 = extractelement <4 x float> %1701, i64 3
+ %1706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1302, <4 x half> %1692, <4 x float> %1163, i32 0, i32 0, i32 0)
+ %1707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1303, <4 x half> %1693, <4 x float> %1706, i32 0, i32 0, i32 0)
+ %1708 = extractelement <4 x float> %1707, i64 0
+ %1709 = extractelement <4 x float> %1707, i64 1
+ %1710 = extractelement <4 x float> %1707, i64 2
+ %1711 = extractelement <4 x float> %1707, i64 3
+ %1712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1304, <4 x half> %1692, <4 x float> %1169, i32 0, i32 0, i32 0)
+ %1713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1305, <4 x half> %1693, <4 x float> %1712, i32 0, i32 0, i32 0)
+ %1714 = extractelement <4 x float> %1713, i64 0
+ %1715 = extractelement <4 x float> %1713, i64 1
+ %1716 = extractelement <4 x float> %1713, i64 2
+ %1717 = extractelement <4 x float> %1713, i64 3
+ %1718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1690, <4 x float> %1175, i32 0, i32 0, i32 0)
+ %1719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1691, <4 x float> %1718, i32 0, i32 0, i32 0)
+ %1720 = extractelement <4 x float> %1719, i64 0
+ %1721 = extractelement <4 x float> %1719, i64 1
+ %1722 = extractelement <4 x float> %1719, i64 2
+ %1723 = extractelement <4 x float> %1719, i64 3
+ %1724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1690, <4 x float> %1181, i32 0, i32 0, i32 0)
+ %1725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1691, <4 x float> %1724, i32 0, i32 0, i32 0)
+ %1726 = extractelement <4 x float> %1725, i64 0
+ %1727 = extractelement <4 x float> %1725, i64 1
+ %1728 = extractelement <4 x float> %1725, i64 2
+ %1729 = extractelement <4 x float> %1725, i64 3
+ %1730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1334, <4 x half> %1692, <4 x float> %1187, i32 0, i32 0, i32 0)
+ %1731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1335, <4 x half> %1693, <4 x float> %1730, i32 0, i32 0, i32 0)
+ %1732 = extractelement <4 x float> %1731, i64 0
+ %1733 = extractelement <4 x float> %1731, i64 1
+ %1734 = extractelement <4 x float> %1731, i64 2
+ %1735 = extractelement <4 x float> %1731, i64 3
+ %1736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1336, <4 x half> %1692, <4 x float> %1193, i32 0, i32 0, i32 0)
+ %1737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1337, <4 x half> %1693, <4 x float> %1736, i32 0, i32 0, i32 0)
+ %1738 = extractelement <4 x float> %1737, i64 0
+ %1739 = extractelement <4 x float> %1737, i64 1
+ %1740 = extractelement <4 x float> %1737, i64 2
+ %1741 = extractelement <4 x float> %1737, i64 3
+ %1742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1638, <4 x float> %1199, i32 0, i32 0, i32 0)
+ %1743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1639, <4 x float> %1742, i32 0, i32 0, i32 0)
+ %1744 = extractelement <4 x float> %1743, i64 0
+ %1745 = extractelement <4 x float> %1743, i64 1
+ %1746 = extractelement <4 x float> %1743, i64 2
+ %1747 = extractelement <4 x float> %1743, i64 3
+ %1748 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1638, <4 x float> %1205, i32 0, i32 0, i32 0)
+ %1749 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1639, <4 x float> %1748, i32 0, i32 0, i32 0)
+ %1750 = extractelement <4 x float> %1749, i64 0
+ %1751 = extractelement <4 x float> %1749, i64 1
+ %1752 = extractelement <4 x float> %1749, i64 2
+ %1753 = extractelement <4 x float> %1749, i64 3
+ %1754 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1640, <4 x float> %1211, i32 0, i32 0, i32 0)
+ %1755 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1641, <4 x float> %1754, i32 0, i32 0, i32 0)
+ %1756 = extractelement <4 x float> %1755, i64 0
+ %1757 = extractelement <4 x float> %1755, i64 1
+ %1758 = extractelement <4 x float> %1755, i64 2
+ %1759 = extractelement <4 x float> %1755, i64 3
+ %1760 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1640, <4 x float> %1217, i32 0, i32 0, i32 0)
+ %1761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1641, <4 x float> %1760, i32 0, i32 0, i32 0)
+ %1762 = extractelement <4 x float> %1761, i64 0
+ %1763 = extractelement <4 x float> %1761, i64 1
+ %1764 = extractelement <4 x float> %1761, i64 2
+ %1765 = extractelement <4 x float> %1761, i64 3
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %1766 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %1767 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %1768 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %1769 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %1770 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1638, <4 x float> %1227, i32 0, i32 0, i32 0)
+ %1771 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1639, <4 x float> %1770, i32 0, i32 0, i32 0)
+ %1772 = extractelement <4 x float> %1771, i64 0
+ %1773 = extractelement <4 x float> %1771, i64 1
+ %1774 = extractelement <4 x float> %1771, i64 2
+ %1775 = extractelement <4 x float> %1771, i64 3
+ %1776 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1638, <4 x float> %1233, i32 0, i32 0, i32 0)
+ %1777 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1639, <4 x float> %1776, i32 0, i32 0, i32 0)
+ %1778 = extractelement <4 x float> %1777, i64 0
+ %1779 = extractelement <4 x float> %1777, i64 1
+ %1780 = extractelement <4 x float> %1777, i64 2
+ %1781 = extractelement <4 x float> %1777, i64 3
+ %1782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1640, <4 x float> %1239, i32 0, i32 0, i32 0)
+ %1783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1641, <4 x float> %1782, i32 0, i32 0, i32 0)
+ %1784 = extractelement <4 x float> %1783, i64 0
+ %1785 = extractelement <4 x float> %1783, i64 1
+ %1786 = extractelement <4 x float> %1783, i64 2
+ %1787 = extractelement <4 x float> %1783, i64 3
+ %1788 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1640, <4 x float> %1245, i32 0, i32 0, i32 0)
+ %1789 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1641, <4 x float> %1788, i32 0, i32 0, i32 0)
+ %1790 = extractelement <4 x float> %1789, i64 0
+ %1791 = extractelement <4 x float> %1789, i64 1
+ %1792 = extractelement <4 x float> %1789, i64 2
+ %1793 = extractelement <4 x float> %1789, i64 3
+ %1794 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %1795 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %1796 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %1797 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %1798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1690, <4 x float> %1255, i32 0, i32 0, i32 0)
+ %1799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1691, <4 x float> %1798, i32 0, i32 0, i32 0)
+ %1800 = extractelement <4 x float> %1799, i64 0
+ %1801 = extractelement <4 x float> %1799, i64 1
+ %1802 = extractelement <4 x float> %1799, i64 2
+ %1803 = extractelement <4 x float> %1799, i64 3
+ %1804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1690, <4 x float> %1261, i32 0, i32 0, i32 0)
+ %1805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1691, <4 x float> %1804, i32 0, i32 0, i32 0)
+ %1806 = extractelement <4 x float> %1805, i64 0
+ %1807 = extractelement <4 x float> %1805, i64 1
+ %1808 = extractelement <4 x float> %1805, i64 2
+ %1809 = extractelement <4 x float> %1805, i64 3
+ %1810 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1476, <4 x half> %1692, <4 x float> %1267, i32 0, i32 0, i32 0)
+ %1811 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1477, <4 x half> %1693, <4 x float> %1810, i32 0, i32 0, i32 0)
+ %1812 = extractelement <4 x float> %1811, i64 0
+ %1813 = extractelement <4 x float> %1811, i64 1
+ %1814 = extractelement <4 x float> %1811, i64 2
+ %1815 = extractelement <4 x float> %1811, i64 3
+ %1816 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1478, <4 x half> %1692, <4 x float> %1273, i32 0, i32 0, i32 0)
+ %1817 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1479, <4 x half> %1693, <4 x float> %1816, i32 0, i32 0, i32 0)
+ %1818 = extractelement <4 x float> %1817, i64 0
+ %1819 = extractelement <4 x float> %1817, i64 1
+ %1820 = extractelement <4 x float> %1817, i64 2
+ %1821 = extractelement <4 x float> %1817, i64 3
+ %1822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1690, <4 x float> %1279, i32 0, i32 0, i32 0)
+ %1823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1691, <4 x float> %1822, i32 0, i32 0, i32 0)
+ %1824 = extractelement <4 x float> %1823, i64 0
+ %1825 = extractelement <4 x float> %1823, i64 1
+ %1826 = extractelement <4 x float> %1823, i64 2
+ %1827 = extractelement <4 x float> %1823, i64 3
+ %1828 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1690, <4 x float> %1285, i32 0, i32 0, i32 0)
+ %1829 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1691, <4 x float> %1828, i32 0, i32 0, i32 0)
+ %1830 = extractelement <4 x float> %1829, i64 0
+ %1831 = extractelement <4 x float> %1829, i64 1
+ %1832 = extractelement <4 x float> %1829, i64 2
+ %1833 = extractelement <4 x float> %1829, i64 3
+ %1834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1504, <4 x half> %1692, <4 x float> %1291, i32 0, i32 0, i32 0)
+ %1835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1505, <4 x half> %1693, <4 x float> %1834, i32 0, i32 0, i32 0)
+ %1836 = extractelement <4 x float> %1835, i64 0
+ %1837 = extractelement <4 x float> %1835, i64 1
+ %1838 = extractelement <4 x float> %1835, i64 2
+ %1839 = extractelement <4 x float> %1835, i64 3
+ %1840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1506, <4 x half> %1692, <4 x float> %1297, i32 0, i32 0, i32 0)
+ %1841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %1507, <4 x half> %1693, <4 x float> %1840, i32 0, i32 0, i32 0)
+ %1842 = extractelement <4 x float> %1841, i64 0
+ %1843 = extractelement <4 x float> %1841, i64 1
+ %1844 = extractelement <4 x float> %1841, i64 2
+ %1845 = extractelement <4 x float> %1841, i64 3
+ %1846 = add nuw nsw i32 %769, 1
+ %exitcond.not = icmp eq i32 %769, %413
+ %1847 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1848 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1849 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1850 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1851 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1852 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1853 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1854 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1855 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1856 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1857 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1858 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1859 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1860 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1861 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1862 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1863 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1864 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1865 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1866 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1867 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1868 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1869 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1870 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1871 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1872 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1873 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1874 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1875 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1876 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1877 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1878 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1879 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1880 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1881 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1882 = shufflevector <8 x half> %1766, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1883 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1884 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1885 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1886 = shufflevector <8 x half> %1767, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1887 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1888 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1889 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1890 = shufflevector <8 x half> %1768, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1891 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1892 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1893 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1894 = shufflevector <8 x half> %1769, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1895 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1896 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1897 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1898 = shufflevector <8 x half> %1794, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1899 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1900 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1901 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1902 = shufflevector <8 x half> %1795, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1903 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1904 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1905 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1906 = shufflevector <8 x half> %1796, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1907 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+ %1908 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1909 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1910 = shufflevector <8 x half> %1797, <8 x half> poison, <2 x i32> <i32 6, i32 7>
+ %1911 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1912 = shufflevector <8 x half> %1461, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1913 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1914 = shufflevector <8 x half> %1463, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1915 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1916 = shufflevector <8 x half> %1465, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1917 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1918 = shufflevector <8 x half> %1467, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1919 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1920 = shufflevector <8 x half> %1469, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1921 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1922 = shufflevector <8 x half> %1471, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1923 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1924 = shufflevector <8 x half> %1473, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1925 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1926 = shufflevector <8 x half> %1475, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1927 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1928 = shufflevector <8 x half> %1623, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1929 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1930 = shufflevector <8 x half> %1625, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1931 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1932 = shufflevector <8 x half> %1627, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1933 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1934 = shufflevector <8 x half> %1629, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1935 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1936 = shufflevector <8 x half> %1631, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1937 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1938 = shufflevector <8 x half> %1633, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1939 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1940 = shufflevector <8 x half> %1635, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ %1941 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 2, i32 3>
+ %1942 = shufflevector <8 x half> %1637, <8 x half> poison, <2 x i32> <i32 4, i32 5>
+ br i1 %exitcond.not, label %._crit_edge, label %510
+
+._crit_edge: ; preds = %510, %.._crit_edge_crit_edge
+ %.pre-phi1068 = phi i32 [ %.pre1067, %.._crit_edge_crit_edge ], [ %410, %510 ]
+ %.pre-phi1066 = phi i32 [ %.pre1065, %.._crit_edge_crit_edge ], [ %409, %510 ]
+ %.pre-phi1064 = phi i32 [ %.pre1063, %.._crit_edge_crit_edge ], [ %406, %510 ]
+ %.pre-phi1062 = phi i32 [ %.pre1061, %.._crit_edge_crit_edge ], [ %405, %510 ]
+ %.pre-phi1060 = phi i32 [ %.pre1059, %.._crit_edge_crit_edge ], [ %402, %510 ]
+ %.pre-phi1058 = phi i32 [ %.pre1057, %.._crit_edge_crit_edge ], [ %401, %510 ]
+ %.pre-phi1056 = phi i32 [ %.pre1055, %.._crit_edge_crit_edge ], [ %398, %510 ]
+ %.pre-phi1054 = phi i32 [ %.pre1053, %.._crit_edge_crit_edge ], [ %397, %510 ]
+ %.pre-phi1052 = phi i32 [ %.pre1051, %.._crit_edge_crit_edge ], [ %394, %510 ]
+ %.pre-phi1050 = phi i32 [ %.pre1049, %.._crit_edge_crit_edge ], [ %393, %510 ]
+ %.pre-phi1048 = phi i32 [ %.pre1047, %.._crit_edge_crit_edge ], [ %390, %510 ]
+ %.pre-phi1046 = phi i32 [ %.pre1045, %.._crit_edge_crit_edge ], [ %384, %510 ]
+ %.pre-phi1044 = phi i32 [ %.pre1043, %.._crit_edge_crit_edge ], [ %383, %510 ]
+ %.pre-phi1042 = phi i32 [ %.pre1041, %.._crit_edge_crit_edge ], [ %387, %510 ]
+ %.pre-phi1034 = phi i32 [ %.pre1033, %.._crit_edge_crit_edge ], [ %377, %510 ]
+ %.pre-phi1030 = phi i32 [ %.pre1029, %.._crit_edge_crit_edge ], [ %375, %510 ]
+ %.pre-phi1026 = phi i32 [ %.pre1025, %.._crit_edge_crit_edge ], [ %371, %510 ]
+ %.pre-phi1022 = phi i32 [ %.pre1021, %.._crit_edge_crit_edge ], [ %369, %510 ]
+ %.pre-phi1018 = phi i32 [ %.pre1017, %.._crit_edge_crit_edge ], [ %365, %510 ]
+ %.pre-phi1014 = phi i32 [ %.pre1013, %.._crit_edge_crit_edge ], [ %361, %510 ]
+ %1943 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1308, %510 ], !dbg !167
+ %1944 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1309, %510 ], !dbg !167
+ %1945 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1310, %510 ], !dbg !167
+ %1946 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1311, %510 ], !dbg !167
+ %1947 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1314, %510 ], !dbg !167
+ %1948 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1315, %510 ], !dbg !167
+ %1949 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1316, %510 ], !dbg !167
+ %1950 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1317, %510 ], !dbg !167
+ %1951 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1320, %510 ], !dbg !167
+ %1952 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1321, %510 ], !dbg !167
+ %1953 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1322, %510 ], !dbg !167
+ %1954 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1323, %510 ], !dbg !167
+ %1955 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1326, %510 ], !dbg !167
+ %1956 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1327, %510 ], !dbg !167
+ %1957 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1328, %510 ], !dbg !167
+ %1958 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1329, %510 ], !dbg !167
+ %1959 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1340, %510 ], !dbg !168
+ %1960 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1341, %510 ], !dbg !168
+ %1961 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1342, %510 ], !dbg !168
+ %1962 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1343, %510 ], !dbg !168
+ %1963 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1346, %510 ], !dbg !168
+ %1964 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1347, %510 ], !dbg !168
+ %1965 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1348, %510 ], !dbg !168
+ %1966 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1349, %510 ], !dbg !168
+ %1967 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1352, %510 ], !dbg !168
+ %1968 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1353, %510 ], !dbg !168
+ %1969 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1354, %510 ], !dbg !168
+ %1970 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1355, %510 ], !dbg !168
+ %1971 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1358, %510 ], !dbg !168
+ %1972 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1359, %510 ], !dbg !168
+ %1973 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1360, %510 ], !dbg !168
+ %1974 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1361, %510 ], !dbg !168
+ %1975 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1482, %510 ], !dbg !169
+ %1976 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1483, %510 ], !dbg !169
+ %1977 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1484, %510 ], !dbg !169
+ %1978 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1485, %510 ], !dbg !169
+ %1979 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1488, %510 ], !dbg !169
+ %1980 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1489, %510 ], !dbg !169
+ %1981 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1490, %510 ], !dbg !169
+ %1982 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1491, %510 ], !dbg !169
+ %1983 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1494, %510 ], !dbg !169
+ %1984 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1495, %510 ], !dbg !169
+ %1985 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1496, %510 ], !dbg !169
+ %1986 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1497, %510 ], !dbg !169
+ %1987 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1500, %510 ], !dbg !169
+ %1988 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1501, %510 ], !dbg !169
+ %1989 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1502, %510 ], !dbg !169
+ %1990 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1503, %510 ], !dbg !169
+ %1991 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1510, %510 ], !dbg !170
+ %1992 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1511, %510 ], !dbg !170
+ %1993 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1512, %510 ], !dbg !170
+ %1994 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1513, %510 ], !dbg !170
+ %1995 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1516, %510 ], !dbg !170
+ %1996 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1517, %510 ], !dbg !170
+ %1997 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1518, %510 ], !dbg !170
+ %1998 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1519, %510 ], !dbg !170
+ %1999 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1522, %510 ], !dbg !170
+ %2000 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1523, %510 ], !dbg !170
+ %2001 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1524, %510 ], !dbg !170
+ %2002 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1525, %510 ], !dbg !170
+ %2003 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1528, %510 ], !dbg !170
+ %2004 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1529, %510 ], !dbg !170
+ %2005 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1530, %510 ], !dbg !170
+ %2006 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1531, %510 ], !dbg !170
+ %2007 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1372, %510 ], !dbg !171
+ %2008 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1373, %510 ], !dbg !171
+ %2009 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1374, %510 ], !dbg !171
+ %2010 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1375, %510 ], !dbg !171
+ %2011 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1378, %510 ], !dbg !171
+ %2012 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1379, %510 ], !dbg !171
+ %2013 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1380, %510 ], !dbg !171
+ %2014 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1381, %510 ], !dbg !171
+ %2015 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1384, %510 ], !dbg !171
+ %2016 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1385, %510 ], !dbg !171
+ %2017 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1386, %510 ], !dbg !171
+ %2018 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1387, %510 ], !dbg !171
+ %2019 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1390, %510 ], !dbg !171
+ %2020 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1391, %510 ], !dbg !171
+ %2021 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1392, %510 ], !dbg !171
+ %2022 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1393, %510 ], !dbg !171
+ %2023 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1396, %510 ], !dbg !172
+ %2024 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1397, %510 ], !dbg !172
+ %2025 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1398, %510 ], !dbg !172
+ %2026 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1399, %510 ], !dbg !172
+ %2027 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1402, %510 ], !dbg !172
+ %2028 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1403, %510 ], !dbg !172
+ %2029 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1404, %510 ], !dbg !172
+ %2030 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1405, %510 ], !dbg !172
+ %2031 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1408, %510 ], !dbg !172
+ %2032 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1409, %510 ], !dbg !172
+ %2033 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1410, %510 ], !dbg !172
+ %2034 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1411, %510 ], !dbg !172
+ %2035 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1414, %510 ], !dbg !172
+ %2036 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1415, %510 ], !dbg !172
+ %2037 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1416, %510 ], !dbg !172
+ %2038 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1417, %510 ], !dbg !172
+ %2039 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1534, %510 ], !dbg !173
+ %2040 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1535, %510 ], !dbg !173
+ %2041 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1536, %510 ], !dbg !173
+ %2042 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1537, %510 ], !dbg !173
+ %2043 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1540, %510 ], !dbg !173
+ %2044 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1541, %510 ], !dbg !173
+ %2045 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1542, %510 ], !dbg !173
+ %2046 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1543, %510 ], !dbg !173
+ %2047 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1546, %510 ], !dbg !173
+ %2048 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1547, %510 ], !dbg !173
+ %2049 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1548, %510 ], !dbg !173
+ %2050 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1549, %510 ], !dbg !173
+ %2051 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1552, %510 ], !dbg !173
+ %2052 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1553, %510 ], !dbg !173
+ %2053 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1554, %510 ], !dbg !173
+ %2054 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1555, %510 ], !dbg !173
+ %2055 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1558, %510 ], !dbg !174
+ %2056 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1559, %510 ], !dbg !174
+ %2057 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1560, %510 ], !dbg !174
+ %2058 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1561, %510 ], !dbg !174
+ %2059 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1564, %510 ], !dbg !174
+ %2060 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1565, %510 ], !dbg !174
+ %2061 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1566, %510 ], !dbg !174
+ %2062 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1567, %510 ], !dbg !174
+ %2063 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1570, %510 ], !dbg !174
+ %2064 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1571, %510 ], !dbg !174
+ %2065 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1572, %510 ], !dbg !174
+ %2066 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1573, %510 ], !dbg !174
+ %2067 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1576, %510 ], !dbg !174
+ %2068 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1577, %510 ], !dbg !174
+ %2069 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1578, %510 ], !dbg !174
+ %2070 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1579, %510 ], !dbg !174
+ %2071 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1644, %510 ], !dbg !175
+ %2072 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1645, %510 ], !dbg !175
+ %2073 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1646, %510 ], !dbg !175
+ %2074 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1647, %510 ], !dbg !175
+ %2075 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1650, %510 ], !dbg !175
+ %2076 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1651, %510 ], !dbg !175
+ %2077 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1652, %510 ], !dbg !175
+ %2078 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1653, %510 ], !dbg !175
+ %2079 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1656, %510 ], !dbg !175
+ %2080 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1657, %510 ], !dbg !175
+ %2081 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1658, %510 ], !dbg !175
+ %2082 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1659, %510 ], !dbg !175
+ %2083 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1662, %510 ], !dbg !175
+ %2084 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1663, %510 ], !dbg !175
+ %2085 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1664, %510 ], !dbg !175
+ %2086 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1665, %510 ], !dbg !175
+ %2087 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1668, %510 ], !dbg !176
+ %2088 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1669, %510 ], !dbg !176
+ %2089 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1670, %510 ], !dbg !176
+ %2090 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1671, %510 ], !dbg !176
+ %2091 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1674, %510 ], !dbg !176
+ %2092 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1675, %510 ], !dbg !176
+ %2093 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1676, %510 ], !dbg !176
+ %2094 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1677, %510 ], !dbg !176
+ %2095 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1680, %510 ], !dbg !176
+ %2096 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1681, %510 ], !dbg !176
+ %2097 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1682, %510 ], !dbg !176
+ %2098 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1683, %510 ], !dbg !176
+ %2099 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1686, %510 ], !dbg !176
+ %2100 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1687, %510 ], !dbg !176
+ %2101 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1688, %510 ], !dbg !176
+ %2102 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1689, %510 ], !dbg !176
+ %2103 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1744, %510 ], !dbg !177
+ %2104 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1745, %510 ], !dbg !177
+ %2105 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1746, %510 ], !dbg !177
+ %2106 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1747, %510 ], !dbg !177
+ %2107 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1750, %510 ], !dbg !177
+ %2108 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1751, %510 ], !dbg !177
+ %2109 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1752, %510 ], !dbg !177
+ %2110 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1753, %510 ], !dbg !177
+ %2111 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1756, %510 ], !dbg !177
+ %2112 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1757, %510 ], !dbg !177
+ %2113 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1758, %510 ], !dbg !177
+ %2114 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1759, %510 ], !dbg !177
+ %2115 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1762, %510 ], !dbg !177
+ %2116 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1763, %510 ], !dbg !177
+ %2117 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1764, %510 ], !dbg !177
+ %2118 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1765, %510 ], !dbg !177
+ %2119 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1772, %510 ], !dbg !178
+ %2120 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1773, %510 ], !dbg !178
+ %2121 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1774, %510 ], !dbg !178
+ %2122 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1775, %510 ], !dbg !178
+ %2123 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1778, %510 ], !dbg !178
+ %2124 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1779, %510 ], !dbg !178
+ %2125 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1780, %510 ], !dbg !178
+ %2126 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1781, %510 ], !dbg !178
+ %2127 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1784, %510 ], !dbg !178
+ %2128 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1785, %510 ], !dbg !178
+ %2129 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1786, %510 ], !dbg !178
+ %2130 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1787, %510 ], !dbg !178
+ %2131 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1790, %510 ], !dbg !178
+ %2132 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1791, %510 ], !dbg !178
+ %2133 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1792, %510 ], !dbg !178
+ %2134 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1793, %510 ], !dbg !178
+ %2135 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1696, %510 ], !dbg !179
+ %2136 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1697, %510 ], !dbg !179
+ %2137 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1698, %510 ], !dbg !179
+ %2138 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1699, %510 ], !dbg !179
+ %2139 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1702, %510 ], !dbg !179
+ %2140 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1703, %510 ], !dbg !179
+ %2141 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1704, %510 ], !dbg !179
+ %2142 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1705, %510 ], !dbg !179
+ %2143 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1708, %510 ], !dbg !179
+ %2144 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1709, %510 ], !dbg !179
+ %2145 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1710, %510 ], !dbg !179
+ %2146 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1711, %510 ], !dbg !179
+ %2147 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1714, %510 ], !dbg !179
+ %2148 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1715, %510 ], !dbg !179
+ %2149 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1716, %510 ], !dbg !179
+ %2150 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1717, %510 ], !dbg !179
+ %2151 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1720, %510 ], !dbg !180
+ %2152 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1721, %510 ], !dbg !180
+ %2153 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1722, %510 ], !dbg !180
+ %2154 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1723, %510 ], !dbg !180
+ %2155 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1726, %510 ], !dbg !180
+ %2156 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1727, %510 ], !dbg !180
+ %2157 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1728, %510 ], !dbg !180
+ %2158 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1729, %510 ], !dbg !180
+ %2159 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1732, %510 ], !dbg !180
+ %2160 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1733, %510 ], !dbg !180
+ %2161 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1734, %510 ], !dbg !180
+ %2162 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1735, %510 ], !dbg !180
+ %2163 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1738, %510 ], !dbg !180
+ %2164 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1739, %510 ], !dbg !180
+ %2165 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1740, %510 ], !dbg !180
+ %2166 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1741, %510 ], !dbg !180
+ %2167 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1800, %510 ], !dbg !181
+ %2168 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1801, %510 ], !dbg !181
+ %2169 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1802, %510 ], !dbg !181
+ %2170 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1803, %510 ], !dbg !181
+ %2171 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1806, %510 ], !dbg !181
+ %2172 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1807, %510 ], !dbg !181
+ %2173 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1808, %510 ], !dbg !181
+ %2174 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1809, %510 ], !dbg !181
+ %2175 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1812, %510 ], !dbg !181
+ %2176 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1813, %510 ], !dbg !181
+ %2177 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1814, %510 ], !dbg !181
+ %2178 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1815, %510 ], !dbg !181
+ %2179 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1818, %510 ], !dbg !181
+ %2180 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1819, %510 ], !dbg !181
+ %2181 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1820, %510 ], !dbg !181
+ %2182 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1821, %510 ], !dbg !181
+ %2183 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1824, %510 ], !dbg !182
+ %2184 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1825, %510 ], !dbg !182
+ %2185 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1826, %510 ], !dbg !182
+ %2186 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1827, %510 ], !dbg !182
+ %2187 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1830, %510 ], !dbg !182
+ %2188 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1831, %510 ], !dbg !182
+ %2189 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1832, %510 ], !dbg !182
+ %2190 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1833, %510 ], !dbg !182
+ %2191 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1836, %510 ], !dbg !182
+ %2192 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1837, %510 ], !dbg !182
+ %2193 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1838, %510 ], !dbg !182
+ %2194 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1839, %510 ], !dbg !182
+ %2195 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1842, %510 ], !dbg !182
+ %2196 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1843, %510 ], !dbg !182
+ %2197 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1844, %510 ], !dbg !182
+ %2198 = phi float [ 0.000000e+00, %.._crit_edge_crit_edge ], [ %1845, %510 ], !dbg !182
+ %2199 = phi <2 x half> [ %263, %.._crit_edge_crit_edge ], [ %1847, %510 ]
+ %2200 = phi <2 x half> [ %264, %.._crit_edge_crit_edge ], [ %1848, %510 ]
+ %2201 = phi <2 x half> [ %265, %.._crit_edge_crit_edge ], [ %1849, %510 ]
+ %2202 = phi <2 x half> [ %266, %.._crit_edge_crit_edge ], [ %1850, %510 ]
+ %2203 = phi <2 x half> [ %267, %.._crit_edge_crit_edge ], [ %1851, %510 ]
+ %2204 = phi <2 x half> [ %268, %.._crit_edge_crit_edge ], [ %1852, %510 ]
+ %2205 = phi <2 x half> [ %269, %.._crit_edge_crit_edge ], [ %1853, %510 ]
+ %2206 = phi <2 x half> [ %270, %.._crit_edge_crit_edge ], [ %1854, %510 ]
+ %2207 = phi <2 x half> [ %271, %.._crit_edge_crit_edge ], [ %1855, %510 ]
+ %2208 = phi <2 x half> [ %272, %.._crit_edge_crit_edge ], [ %1856, %510 ]
+ %2209 = phi <2 x half> [ %273, %.._crit_edge_crit_edge ], [ %1857, %510 ]
+ %2210 = phi <2 x half> [ %274, %.._crit_edge_crit_edge ], [ %1858, %510 ]
+ %2211 = phi <2 x half> [ %275, %.._crit_edge_crit_edge ], [ %1859, %510 ]
+ %2212 = phi <2 x half> [ %276, %.._crit_edge_crit_edge ], [ %1860, %510 ]
+ %2213 = phi <2 x half> [ %277, %.._crit_edge_crit_edge ], [ %1861, %510 ]
+ %2214 = phi <2 x half> [ %278, %.._crit_edge_crit_edge ], [ %1862, %510 ]
+ %2215 = phi <2 x half> [ %279, %.._crit_edge_crit_edge ], [ %1863, %510 ]
+ %2216 = phi <2 x half> [ %280, %.._crit_edge_crit_edge ], [ %1864, %510 ]
+ %2217 = phi <2 x half> [ %281, %.._crit_edge_crit_edge ], [ %1865, %510 ]
+ %2218 = phi <2 x half> [ %282, %.._crit_edge_crit_edge ], [ %1866, %510 ]
+ %2219 = phi <2 x half> [ %283, %.._crit_edge_crit_edge ], [ %1867, %510 ]
+ %2220 = phi <2 x half> [ %284, %.._crit_edge_crit_edge ], [ %1868, %510 ]
+ %2221 = phi <2 x half> [ %285, %.._crit_edge_crit_edge ], [ %1869, %510 ]
+ %2222 = phi <2 x half> [ %286, %.._crit_edge_crit_edge ], [ %1870, %510 ]
+ %2223 = phi <2 x half> [ %287, %.._crit_edge_crit_edge ], [ %1871, %510 ]
+ %2224 = phi <2 x half> [ %288, %.._crit_edge_crit_edge ], [ %1872, %510 ]
+ %2225 = phi <2 x half> [ %289, %.._crit_edge_crit_edge ], [ %1873, %510 ]
+ %2226 = phi <2 x half> [ %290, %.._crit_edge_crit_edge ], [ %1874, %510 ]
+ %2227 = phi <2 x half> [ %291, %.._crit_edge_crit_edge ], [ %1875, %510 ]
+ %2228 = phi <2 x half> [ %292, %.._crit_edge_crit_edge ], [ %1876, %510 ]
+ %2229 = phi <2 x half> [ %293, %.._crit_edge_crit_edge ], [ %1877, %510 ]
+ %2230 = phi <2 x half> [ %294, %.._crit_edge_crit_edge ], [ %1878, %510 ]
+ %2231 = phi <2 x half> [ %295, %.._crit_edge_crit_edge ], [ %1879, %510 ]
+ %2232 = phi <2 x half> [ %296, %.._crit_edge_crit_edge ], [ %1880, %510 ]
+ %2233 = phi <2 x half> [ %297, %.._crit_edge_crit_edge ], [ %1881, %510 ]
+ %2234 = phi <2 x half> [ %298, %.._crit_edge_crit_edge ], [ %1882, %510 ]
+ %2235 = phi <2 x half> [ %299, %.._crit_edge_crit_edge ], [ %1883, %510 ]
+ %2236 = phi <2 x half> [ %300, %.._crit_edge_crit_edge ], [ %1884, %510 ]
+ %2237 = phi <2 x half> [ %301, %.._crit_edge_crit_edge ], [ %1885, %510 ]
+ %2238 = phi <2 x half> [ %302, %.._crit_edge_crit_edge ], [ %1886, %510 ]
+ %2239 = phi <2 x half> [ %303, %.._crit_edge_crit_edge ], [ %1887, %510 ]
+ %2240 = phi <2 x half> [ %304, %.._crit_edge_crit_edge ], [ %1888, %510 ]
+ %2241 = phi <2 x half> [ %305, %.._crit_edge_crit_edge ], [ %1889, %510 ]
+ %2242 = phi <2 x half> [ %306, %.._crit_edge_crit_edge ], [ %1890, %510 ]
+ %2243 = phi <2 x half> [ %307, %.._crit_edge_crit_edge ], [ %1891, %510 ]
+ %2244 = phi <2 x half> [ %308, %.._crit_edge_crit_edge ], [ %1892, %510 ]
+ %2245 = phi <2 x half> [ %309, %.._crit_edge_crit_edge ], [ %1893, %510 ]
+ %2246 = phi <2 x half> [ %310, %.._crit_edge_crit_edge ], [ %1894, %510 ]
+ %2247 = phi <2 x half> [ %311, %.._crit_edge_crit_edge ], [ %1895, %510 ]
+ %2248 = phi <2 x half> [ %312, %.._crit_edge_crit_edge ], [ %1896, %510 ]
+ %2249 = phi <2 x half> [ %313, %.._crit_edge_crit_edge ], [ %1897, %510 ]
+ %2250 = phi <2 x half> [ %314, %.._crit_edge_crit_edge ], [ %1898, %510 ]
+ %2251 = phi <2 x half> [ %315, %.._crit_edge_crit_edge ], [ %1899, %510 ]
+ %2252 = phi <2 x half> [ %316, %.._crit_edge_crit_edge ], [ %1900, %510 ]
+ %2253 = phi <2 x half> [ %317, %.._crit_edge_crit_edge ], [ %1901, %510 ]
+ %2254 = phi <2 x half> [ %318, %.._crit_edge_crit_edge ], [ %1902, %510 ]
+ %2255 = phi <2 x half> [ %319, %.._crit_edge_crit_edge ], [ %1903, %510 ]
+ %2256 = phi <2 x half> [ %320, %.._crit_edge_crit_edge ], [ %1904, %510 ]
+ %2257 = phi <2 x half> [ %321, %.._crit_edge_crit_edge ], [ %1905, %510 ]
+ %2258 = phi <2 x half> [ %322, %.._crit_edge_crit_edge ], [ %1906, %510 ]
+ %2259 = phi <2 x half> [ %323, %.._crit_edge_crit_edge ], [ %1907, %510 ]
+ %2260 = phi <2 x half> [ %324, %.._crit_edge_crit_edge ], [ %1908, %510 ]
+ %2261 = phi <2 x half> [ %325, %.._crit_edge_crit_edge ], [ %1909, %510 ]
+ %2262 = phi <2 x half> [ %326, %.._crit_edge_crit_edge ], [ %1910, %510 ]
+ %2263 = phi <2 x half> [ %327, %.._crit_edge_crit_edge ], [ %1911, %510 ]
+ %2264 = phi <2 x half> [ %328, %.._crit_edge_crit_edge ], [ %1912, %510 ]
+ %2265 = phi <2 x half> [ %329, %.._crit_edge_crit_edge ], [ %1913, %510 ]
+ %2266 = phi <2 x half> [ %330, %.._crit_edge_crit_edge ], [ %1914, %510 ]
+ %2267 = phi <2 x half> [ %331, %.._crit_edge_crit_edge ], [ %1915, %510 ]
+ %2268 = phi <2 x half> [ %332, %.._crit_edge_crit_edge ], [ %1916, %510 ]
+ %2269 = phi <2 x half> [ %333, %.._crit_edge_crit_edge ], [ %1917, %510 ]
+ %2270 = phi <2 x half> [ %334, %.._crit_edge_crit_edge ], [ %1918, %510 ]
+ %2271 = phi <2 x half> [ %335, %.._crit_edge_crit_edge ], [ %1919, %510 ]
+ %2272 = phi <2 x half> [ %336, %.._crit_edge_crit_edge ], [ %1920, %510 ]
+ %2273 = phi <2 x half> [ %337, %.._crit_edge_crit_edge ], [ %1921, %510 ]
+ %2274 = phi <2 x half> [ %338, %.._crit_edge_crit_edge ], [ %1922, %510 ]
+ %2275 = phi <2 x half> [ %339, %.._crit_edge_crit_edge ], [ %1923, %510 ]
+ %2276 = phi <2 x half> [ %340, %.._crit_edge_crit_edge ], [ %1924, %510 ]
+ %2277 = phi <2 x half> [ %341, %.._crit_edge_crit_edge ], [ %1925, %510 ]
+ %2278 = phi <2 x half> [ %342, %.._crit_edge_crit_edge ], [ %1926, %510 ]
+ %2279 = phi <2 x half> [ %343, %.._crit_edge_crit_edge ], [ %1927, %510 ]
+ %2280 = phi <2 x half> [ %344, %.._crit_edge_crit_edge ], [ %1928, %510 ]
+ %2281 = phi <2 x half> [ %345, %.._crit_edge_crit_edge ], [ %1929, %510 ]
+ %2282 = phi <2 x half> [ %346, %.._crit_edge_crit_edge ], [ %1930, %510 ]
+ %2283 = phi <2 x half> [ %347, %.._crit_edge_crit_edge ], [ %1931, %510 ]
+ %2284 = phi <2 x half> [ %348, %.._crit_edge_crit_edge ], [ %1932, %510 ]
+ %2285 = phi <2 x half> [ %349, %.._crit_edge_crit_edge ], [ %1933, %510 ]
+ %2286 = phi <2 x half> [ %350, %.._crit_edge_crit_edge ], [ %1934, %510 ]
+ %2287 = phi <2 x half> [ %351, %.._crit_edge_crit_edge ], [ %1935, %510 ]
+ %2288 = phi <2 x half> [ %352, %.._crit_edge_crit_edge ], [ %1936, %510 ]
+ %2289 = phi <2 x half> [ %353, %.._crit_edge_crit_edge ], [ %1937, %510 ]
+ %2290 = phi <2 x half> [ %354, %.._crit_edge_crit_edge ], [ %1938, %510 ]
+ %2291 = phi <2 x half> [ %355, %.._crit_edge_crit_edge ], [ %1939, %510 ]
+ %2292 = phi <2 x half> [ %356, %.._crit_edge_crit_edge ], [ %1940, %510 ]
+ %2293 = phi <2 x half> [ %357, %.._crit_edge_crit_edge ], [ %1941, %510 ]
+ %2294 = phi <2 x half> [ %358, %.._crit_edge_crit_edge ], [ %1942, %510 ]
+ %2295 = and i32 %237, 28
+ %2296 = or disjoint i32 %2295, 224
+ %2297 = or disjoint i32 %2295, 192
+ %2298 = or disjoint i32 %2295, 160
+ %2299 = or disjoint i32 %2295, 128
+ %2300 = or disjoint i32 %2295, 96
+ %2301 = or disjoint i32 %2295, 64
+ %2302 = or disjoint i32 %2295, 32
+ %2303 = or disjoint i32 %225, 224
+ %2304 = or disjoint i32 %225, 192
+ %2305 = or disjoint i32 %225, 160
+ %2306 = or disjoint i32 %225, 128
+ %2307 = or disjoint i32 %225, 96
+ %2308 = or disjoint i32 %225, 64
+ %2309 = or disjoint i32 %225, 32
+ %2310 = shufflevector <2 x half> %2231, <2 x half> %2232, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2311 = shufflevector <2 x half> %2233, <2 x half> %2234, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2312 = shufflevector <2 x half> %2235, <2 x half> %2236, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2313 = shufflevector <2 x half> %2237, <2 x half> %2238, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2314 = shufflevector <2 x half> %2239, <2 x half> %2240, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2315 = shufflevector <2 x half> %2241, <2 x half> %2242, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2316 = shufflevector <2 x half> %2243, <2 x half> %2244, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2317 = shufflevector <2 x half> %2245, <2 x half> %2246, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2318 = insertelement <4 x float> poison, float %1943, i64 0
+ %2319 = insertelement <4 x float> %2318, float %1944, i64 1
+ %2320 = insertelement <4 x float> %2319, float %1945, i64 2
+ %2321 = insertelement <4 x float> %2320, float %1946, i64 3
+ %2322 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2310, <4 x float> %2321, i32 0, i32 0, i32 0)
+ %2323 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2311, <4 x float> %2322, i32 0, i32 0, i32 0)
+ %2324 = insertelement <4 x float> poison, float %1947, i64 0
+ %2325 = insertelement <4 x float> %2324, float %1948, i64 1
+ %2326 = insertelement <4 x float> %2325, float %1949, i64 2
+ %2327 = insertelement <4 x float> %2326, float %1950, i64 3
+ %2328 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2310, <4 x float> %2327, i32 0, i32 0, i32 0)
+ %2329 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2311, <4 x float> %2328, i32 0, i32 0, i32 0)
+ %2330 = insertelement <4 x float> poison, float %1951, i64 0
+ %2331 = insertelement <4 x float> %2330, float %1952, i64 1
+ %2332 = insertelement <4 x float> %2331, float %1953, i64 2
+ %2333 = insertelement <4 x float> %2332, float %1954, i64 3
+ %2334 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2312, <4 x float> %2333, i32 0, i32 0, i32 0)
+ %2335 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2313, <4 x float> %2334, i32 0, i32 0, i32 0)
+ %2336 = insertelement <4 x float> poison, float %1955, i64 0
+ %2337 = insertelement <4 x float> %2336, float %1956, i64 1
+ %2338 = insertelement <4 x float> %2337, float %1957, i64 2
+ %2339 = insertelement <4 x float> %2338, float %1958, i64 3
+ %2340 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2312, <4 x float> %2339, i32 0, i32 0, i32 0)
+ %2341 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2313, <4 x float> %2340, i32 0, i32 0, i32 0)
+ %2342 = shufflevector <2 x half> %2255, <2 x half> %2256, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2343 = shufflevector <2 x half> %2257, <2 x half> %2258, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2344 = shufflevector <2 x half> %2259, <2 x half> %2260, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2345 = shufflevector <2 x half> %2261, <2 x half> %2262, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2346 = insertelement <4 x float> poison, float %1959, i64 0
+ %2347 = insertelement <4 x float> %2346, float %1960, i64 1
+ %2348 = insertelement <4 x float> %2347, float %1961, i64 2
+ %2349 = insertelement <4 x float> %2348, float %1962, i64 3
+ %2350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2310, <4 x float> %2349, i32 0, i32 0, i32 0)
+ %2351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2311, <4 x float> %2350, i32 0, i32 0, i32 0)
+ %2352 = insertelement <4 x float> poison, float %1963, i64 0
+ %2353 = insertelement <4 x float> %2352, float %1964, i64 1
+ %2354 = insertelement <4 x float> %2353, float %1965, i64 2
+ %2355 = insertelement <4 x float> %2354, float %1966, i64 3
+ %2356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2310, <4 x float> %2355, i32 0, i32 0, i32 0)
+ %2357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2311, <4 x float> %2356, i32 0, i32 0, i32 0)
+ %2358 = insertelement <4 x float> poison, float %1967, i64 0
+ %2359 = insertelement <4 x float> %2358, float %1968, i64 1
+ %2360 = insertelement <4 x float> %2359, float %1969, i64 2
+ %2361 = insertelement <4 x float> %2360, float %1970, i64 3
+ %2362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2312, <4 x float> %2361, i32 0, i32 0, i32 0)
+ %2363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2313, <4 x float> %2362, i32 0, i32 0, i32 0)
+ %2364 = insertelement <4 x float> poison, float %1971, i64 0
+ %2365 = insertelement <4 x float> %2364, float %1972, i64 1
+ %2366 = insertelement <4 x float> %2365, float %1973, i64 2
+ %2367 = insertelement <4 x float> %2366, float %1974, i64 3
+ %2368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2312, <4 x float> %2367, i32 0, i32 0, i32 0)
+ %2369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2313, <4 x float> %2368, i32 0, i32 0, i32 0)
+ %2370 = shufflevector <2 x half> %2247, <2 x half> %2248, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2371 = shufflevector <2 x half> %2249, <2 x half> %2250, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2372 = shufflevector <2 x half> %2251, <2 x half> %2252, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2373 = shufflevector <2 x half> %2253, <2 x half> %2254, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2374 = insertelement <4 x float> poison, float %2007, i64 0
+ %2375 = insertelement <4 x float> %2374, float %2008, i64 1
+ %2376 = insertelement <4 x float> %2375, float %2009, i64 2
+ %2377 = insertelement <4 x float> %2376, float %2010, i64 3
+ %2378 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2370, <4 x float> %2377, i32 0, i32 0, i32 0)
+ %2379 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2371, <4 x float> %2378, i32 0, i32 0, i32 0)
+ %2380 = insertelement <4 x float> poison, float %2011, i64 0
+ %2381 = insertelement <4 x float> %2380, float %2012, i64 1
+ %2382 = insertelement <4 x float> %2381, float %2013, i64 2
+ %2383 = insertelement <4 x float> %2382, float %2014, i64 3
+ %2384 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2370, <4 x float> %2383, i32 0, i32 0, i32 0)
+ %2385 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2371, <4 x float> %2384, i32 0, i32 0, i32 0)
+ %2386 = insertelement <4 x float> poison, float %2015, i64 0
+ %2387 = insertelement <4 x float> %2386, float %2016, i64 1
+ %2388 = insertelement <4 x float> %2387, float %2017, i64 2
+ %2389 = insertelement <4 x float> %2388, float %2018, i64 3
+ %2390 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2372, <4 x float> %2389, i32 0, i32 0, i32 0)
+ %2391 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2373, <4 x float> %2390, i32 0, i32 0, i32 0)
+ %2392 = insertelement <4 x float> poison, float %2019, i64 0
+ %2393 = insertelement <4 x float> %2392, float %2020, i64 1
+ %2394 = insertelement <4 x float> %2393, float %2021, i64 2
+ %2395 = insertelement <4 x float> %2394, float %2022, i64 3
+ %2396 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2372, <4 x float> %2395, i32 0, i32 0, i32 0)
+ %2397 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2373, <4 x float> %2396, i32 0, i32 0, i32 0)
+ %2398 = insertelement <4 x float> poison, float %2023, i64 0
+ %2399 = insertelement <4 x float> %2398, float %2024, i64 1
+ %2400 = insertelement <4 x float> %2399, float %2025, i64 2
+ %2401 = insertelement <4 x float> %2400, float %2026, i64 3
+ %2402 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2370, <4 x float> %2401, i32 0, i32 0, i32 0)
+ %2403 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2371, <4 x float> %2402, i32 0, i32 0, i32 0)
+ %2404 = insertelement <4 x float> poison, float %2027, i64 0
+ %2405 = insertelement <4 x float> %2404, float %2028, i64 1
+ %2406 = insertelement <4 x float> %2405, float %2029, i64 2
+ %2407 = insertelement <4 x float> %2406, float %2030, i64 3
+ %2408 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2370, <4 x float> %2407, i32 0, i32 0, i32 0)
+ %2409 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2371, <4 x float> %2408, i32 0, i32 0, i32 0)
+ %2410 = insertelement <4 x float> poison, float %2031, i64 0
+ %2411 = insertelement <4 x float> %2410, float %2032, i64 1
+ %2412 = insertelement <4 x float> %2411, float %2033, i64 2
+ %2413 = insertelement <4 x float> %2412, float %2034, i64 3
+ %2414 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2372, <4 x float> %2413, i32 0, i32 0, i32 0)
+ %2415 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2373, <4 x float> %2414, i32 0, i32 0, i32 0)
+ %2416 = insertelement <4 x float> poison, float %2035, i64 0
+ %2417 = insertelement <4 x float> %2416, float %2036, i64 1
+ %2418 = insertelement <4 x float> %2417, float %2037, i64 2
+ %2419 = insertelement <4 x float> %2418, float %2038, i64 3
+ %2420 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2372, <4 x float> %2419, i32 0, i32 0, i32 0)
+ %2421 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2373, <4 x float> %2420, i32 0, i32 0, i32 0)
+ %2422 = or disjoint i32 %.pre-phi1014, 2048
+ %2423 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1014
+ %2424 = load <8 x half>, ptr addrspace(3) %2423, align 16
+ %2425 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2422
+ %2426 = load <8 x half>, ptr addrspace(3) %2425, align 16
+ %2427 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2428 = shufflevector <8 x half> %2424, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2429 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2430 = shufflevector <8 x half> %2426, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2431 = insertelement <4 x float> poison, float %1975, i64 0
+ %2432 = insertelement <4 x float> %2431, float %1976, i64 1
+ %2433 = insertelement <4 x float> %2432, float %1977, i64 2
+ %2434 = insertelement <4 x float> %2433, float %1978, i64 3
+ %2435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2310, <4 x float> %2434, i32 0, i32 0, i32 0)
+ %2436 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2311, <4 x float> %2435, i32 0, i32 0, i32 0)
+ %2437 = insertelement <4 x float> poison, float %1979, i64 0
+ %2438 = insertelement <4 x float> %2437, float %1980, i64 1
+ %2439 = insertelement <4 x float> %2438, float %1981, i64 2
+ %2440 = insertelement <4 x float> %2439, float %1982, i64 3
+ %2441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2310, <4 x float> %2440, i32 0, i32 0, i32 0)
+ %2442 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2311, <4 x float> %2441, i32 0, i32 0, i32 0)
+ %2443 = insertelement <4 x float> poison, float %1983, i64 0
+ %2444 = insertelement <4 x float> %2443, float %1984, i64 1
+ %2445 = insertelement <4 x float> %2444, float %1985, i64 2
+ %2446 = insertelement <4 x float> %2445, float %1986, i64 3
+ %2447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2312, <4 x float> %2446, i32 0, i32 0, i32 0)
+ %2448 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2313, <4 x float> %2447, i32 0, i32 0, i32 0)
+ %2449 = insertelement <4 x float> poison, float %1987, i64 0
+ %2450 = insertelement <4 x float> %2449, float %1988, i64 1
+ %2451 = insertelement <4 x float> %2450, float %1989, i64 2
+ %2452 = insertelement <4 x float> %2451, float %1990, i64 3
+ %2453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2312, <4 x float> %2452, i32 0, i32 0, i32 0)
+ %2454 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2313, <4 x float> %2453, i32 0, i32 0, i32 0)
+ %2455 = or disjoint i32 %.pre-phi1018, 2048
+ %2456 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1018
+ %2457 = load <8 x half>, ptr addrspace(3) %2456, align 16
+ %2458 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2455
+ %2459 = load <8 x half>, ptr addrspace(3) %2458, align 16
+ %2460 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2461 = shufflevector <8 x half> %2457, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2462 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2463 = shufflevector <8 x half> %2459, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2464 = insertelement <4 x float> poison, float %1991, i64 0
+ %2465 = insertelement <4 x float> %2464, float %1992, i64 1
+ %2466 = insertelement <4 x float> %2465, float %1993, i64 2
+ %2467 = insertelement <4 x float> %2466, float %1994, i64 3
+ %2468 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2310, <4 x float> %2467, i32 0, i32 0, i32 0)
+ %2469 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2311, <4 x float> %2468, i32 0, i32 0, i32 0)
+ %2470 = insertelement <4 x float> poison, float %1995, i64 0
+ %2471 = insertelement <4 x float> %2470, float %1996, i64 1
+ %2472 = insertelement <4 x float> %2471, float %1997, i64 2
+ %2473 = insertelement <4 x float> %2472, float %1998, i64 3
+ %2474 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2310, <4 x float> %2473, i32 0, i32 0, i32 0)
+ %2475 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2311, <4 x float> %2474, i32 0, i32 0, i32 0)
+ %2476 = insertelement <4 x float> poison, float %1999, i64 0
+ %2477 = insertelement <4 x float> %2476, float %2000, i64 1
+ %2478 = insertelement <4 x float> %2477, float %2001, i64 2
+ %2479 = insertelement <4 x float> %2478, float %2002, i64 3
+ %2480 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2312, <4 x float> %2479, i32 0, i32 0, i32 0)
+ %2481 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2313, <4 x float> %2480, i32 0, i32 0, i32 0)
+ %2482 = insertelement <4 x float> poison, float %2003, i64 0
+ %2483 = insertelement <4 x float> %2482, float %2004, i64 1
+ %2484 = insertelement <4 x float> %2483, float %2005, i64 2
+ %2485 = insertelement <4 x float> %2484, float %2006, i64 3
+ %2486 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2312, <4 x float> %2485, i32 0, i32 0, i32 0)
+ %2487 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2313, <4 x float> %2486, i32 0, i32 0, i32 0)
+ %2488 = insertelement <4 x float> poison, float %2039, i64 0
+ %2489 = insertelement <4 x float> %2488, float %2040, i64 1
+ %2490 = insertelement <4 x float> %2489, float %2041, i64 2
+ %2491 = insertelement <4 x float> %2490, float %2042, i64 3
+ %2492 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2370, <4 x float> %2491, i32 0, i32 0, i32 0)
+ %2493 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2371, <4 x float> %2492, i32 0, i32 0, i32 0)
+ %2494 = insertelement <4 x float> poison, float %2043, i64 0
+ %2495 = insertelement <4 x float> %2494, float %2044, i64 1
+ %2496 = insertelement <4 x float> %2495, float %2045, i64 2
+ %2497 = insertelement <4 x float> %2496, float %2046, i64 3
+ %2498 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2370, <4 x float> %2497, i32 0, i32 0, i32 0)
+ %2499 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2371, <4 x float> %2498, i32 0, i32 0, i32 0)
+ %2500 = insertelement <4 x float> poison, float %2047, i64 0
+ %2501 = insertelement <4 x float> %2500, float %2048, i64 1
+ %2502 = insertelement <4 x float> %2501, float %2049, i64 2
+ %2503 = insertelement <4 x float> %2502, float %2050, i64 3
+ %2504 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2372, <4 x float> %2503, i32 0, i32 0, i32 0)
+ %2505 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2373, <4 x float> %2504, i32 0, i32 0, i32 0)
+ %2506 = insertelement <4 x float> poison, float %2051, i64 0
+ %2507 = insertelement <4 x float> %2506, float %2052, i64 1
+ %2508 = insertelement <4 x float> %2507, float %2053, i64 2
+ %2509 = insertelement <4 x float> %2508, float %2054, i64 3
+ %2510 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2372, <4 x float> %2509, i32 0, i32 0, i32 0)
+ %2511 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2373, <4 x float> %2510, i32 0, i32 0, i32 0)
+ %2512 = insertelement <4 x float> poison, float %2055, i64 0
+ %2513 = insertelement <4 x float> %2512, float %2056, i64 1
+ %2514 = insertelement <4 x float> %2513, float %2057, i64 2
+ %2515 = insertelement <4 x float> %2514, float %2058, i64 3
+ %2516 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2370, <4 x float> %2515, i32 0, i32 0, i32 0)
+ %2517 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2371, <4 x float> %2516, i32 0, i32 0, i32 0)
+ %2518 = insertelement <4 x float> poison, float %2059, i64 0
+ %2519 = insertelement <4 x float> %2518, float %2060, i64 1
+ %2520 = insertelement <4 x float> %2519, float %2061, i64 2
+ %2521 = insertelement <4 x float> %2520, float %2062, i64 3
+ %2522 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2370, <4 x float> %2521, i32 0, i32 0, i32 0)
+ %2523 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2371, <4 x float> %2522, i32 0, i32 0, i32 0)
+ %2524 = insertelement <4 x float> poison, float %2063, i64 0
+ %2525 = insertelement <4 x float> %2524, float %2064, i64 1
+ %2526 = insertelement <4 x float> %2525, float %2065, i64 2
+ %2527 = insertelement <4 x float> %2526, float %2066, i64 3
+ %2528 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2372, <4 x float> %2527, i32 0, i32 0, i32 0)
+ %2529 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2373, <4 x float> %2528, i32 0, i32 0, i32 0)
+ %2530 = insertelement <4 x float> poison, float %2067, i64 0
+ %2531 = insertelement <4 x float> %2530, float %2068, i64 1
+ %2532 = insertelement <4 x float> %2531, float %2069, i64 2
+ %2533 = insertelement <4 x float> %2532, float %2070, i64 3
+ %2534 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2372, <4 x float> %2533, i32 0, i32 0, i32 0)
+ %2535 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2373, <4 x float> %2534, i32 0, i32 0, i32 0)
+ %2536 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1022
+ %2537 = load <8 x half>, ptr addrspace(3) %2536, align 16
+ %2538 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1026
+ %2539 = load <8 x half>, ptr addrspace(3) %2538, align 16
+ %2540 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2541 = shufflevector <8 x half> %2537, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2542 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2543 = shufflevector <8 x half> %2539, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2544 = insertelement <4 x float> poison, float %2071, i64 0
+ %2545 = insertelement <4 x float> %2544, float %2072, i64 1
+ %2546 = insertelement <4 x float> %2545, float %2073, i64 2
+ %2547 = insertelement <4 x float> %2546, float %2074, i64 3
+ %2548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2540, <4 x float> %2547, i32 0, i32 0, i32 0)
+ %2549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2541, <4 x float> %2548, i32 0, i32 0, i32 0)
+ %2550 = insertelement <4 x float> poison, float %2075, i64 0
+ %2551 = insertelement <4 x float> %2550, float %2076, i64 1
+ %2552 = insertelement <4 x float> %2551, float %2077, i64 2
+ %2553 = insertelement <4 x float> %2552, float %2078, i64 3
+ %2554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2540, <4 x float> %2553, i32 0, i32 0, i32 0)
+ %2555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2541, <4 x float> %2554, i32 0, i32 0, i32 0)
+ %2556 = insertelement <4 x float> poison, float %2079, i64 0
+ %2557 = insertelement <4 x float> %2556, float %2080, i64 1
+ %2558 = insertelement <4 x float> %2557, float %2081, i64 2
+ %2559 = insertelement <4 x float> %2558, float %2082, i64 3
+ %2560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2542, <4 x float> %2559, i32 0, i32 0, i32 0)
+ %2561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2543, <4 x float> %2560, i32 0, i32 0, i32 0)
+ %2562 = insertelement <4 x float> poison, float %2083, i64 0
+ %2563 = insertelement <4 x float> %2562, float %2084, i64 1
+ %2564 = insertelement <4 x float> %2563, float %2085, i64 2
+ %2565 = insertelement <4 x float> %2564, float %2086, i64 3
+ %2566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2542, <4 x float> %2565, i32 0, i32 0, i32 0)
+ %2567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2543, <4 x float> %2566, i32 0, i32 0, i32 0)
+ %2568 = insertelement <4 x float> poison, float %2087, i64 0
+ %2569 = insertelement <4 x float> %2568, float %2088, i64 1
+ %2570 = insertelement <4 x float> %2569, float %2089, i64 2
+ %2571 = insertelement <4 x float> %2570, float %2090, i64 3
+ %2572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2540, <4 x float> %2571, i32 0, i32 0, i32 0)
+ %2573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2541, <4 x float> %2572, i32 0, i32 0, i32 0)
+ %2574 = insertelement <4 x float> poison, float %2091, i64 0
+ %2575 = insertelement <4 x float> %2574, float %2092, i64 1
+ %2576 = insertelement <4 x float> %2575, float %2093, i64 2
+ %2577 = insertelement <4 x float> %2576, float %2094, i64 3
+ %2578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2540, <4 x float> %2577, i32 0, i32 0, i32 0)
+ %2579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2541, <4 x float> %2578, i32 0, i32 0, i32 0)
+ %2580 = insertelement <4 x float> poison, float %2095, i64 0
+ %2581 = insertelement <4 x float> %2580, float %2096, i64 1
+ %2582 = insertelement <4 x float> %2581, float %2097, i64 2
+ %2583 = insertelement <4 x float> %2582, float %2098, i64 3
+ %2584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2542, <4 x float> %2583, i32 0, i32 0, i32 0)
+ %2585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2543, <4 x float> %2584, i32 0, i32 0, i32 0)
+ %2586 = insertelement <4 x float> poison, float %2099, i64 0
+ %2587 = insertelement <4 x float> %2586, float %2100, i64 1
+ %2588 = insertelement <4 x float> %2587, float %2101, i64 2
+ %2589 = insertelement <4 x float> %2588, float %2102, i64 3
+ %2590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2542, <4 x float> %2589, i32 0, i32 0, i32 0)
+ %2591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2543, <4 x float> %2590, i32 0, i32 0, i32 0)
+ %2592 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1030
+ %2593 = load <8 x half>, ptr addrspace(3) %2592, align 16
+ %2594 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1034
+ %2595 = load <8 x half>, ptr addrspace(3) %2594, align 16
+ %2596 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2597 = shufflevector <8 x half> %2593, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2598 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2599 = shufflevector <8 x half> %2595, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2600 = insertelement <4 x float> poison, float %2135, i64 0
+ %2601 = insertelement <4 x float> %2600, float %2136, i64 1
+ %2602 = insertelement <4 x float> %2601, float %2137, i64 2
+ %2603 = insertelement <4 x float> %2602, float %2138, i64 3
+ %2604 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2596, <4 x float> %2603, i32 0, i32 0, i32 0)
+ %2605 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2597, <4 x float> %2604, i32 0, i32 0, i32 0)
+ %2606 = insertelement <4 x float> poison, float %2139, i64 0
+ %2607 = insertelement <4 x float> %2606, float %2140, i64 1
+ %2608 = insertelement <4 x float> %2607, float %2141, i64 2
+ %2609 = insertelement <4 x float> %2608, float %2142, i64 3
+ %2610 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2596, <4 x float> %2609, i32 0, i32 0, i32 0)
+ %2611 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2597, <4 x float> %2610, i32 0, i32 0, i32 0)
+ %2612 = insertelement <4 x float> poison, float %2143, i64 0
+ %2613 = insertelement <4 x float> %2612, float %2144, i64 1
+ %2614 = insertelement <4 x float> %2613, float %2145, i64 2
+ %2615 = insertelement <4 x float> %2614, float %2146, i64 3
+ %2616 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2314, <4 x half> %2598, <4 x float> %2615, i32 0, i32 0, i32 0)
+ %2617 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2315, <4 x half> %2599, <4 x float> %2616, i32 0, i32 0, i32 0)
+ %2618 = insertelement <4 x float> poison, float %2147, i64 0
+ %2619 = insertelement <4 x float> %2618, float %2148, i64 1
+ %2620 = insertelement <4 x float> %2619, float %2149, i64 2
+ %2621 = insertelement <4 x float> %2620, float %2150, i64 3
+ %2622 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2316, <4 x half> %2598, <4 x float> %2621, i32 0, i32 0, i32 0)
+ %2623 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2317, <4 x half> %2599, <4 x float> %2622, i32 0, i32 0, i32 0)
+ %2624 = insertelement <4 x float> poison, float %2151, i64 0
+ %2625 = insertelement <4 x float> %2624, float %2152, i64 1
+ %2626 = insertelement <4 x float> %2625, float %2153, i64 2
+ %2627 = insertelement <4 x float> %2626, float %2154, i64 3
+ %2628 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2596, <4 x float> %2627, i32 0, i32 0, i32 0)
+ %2629 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2597, <4 x float> %2628, i32 0, i32 0, i32 0)
+ %2630 = insertelement <4 x float> poison, float %2155, i64 0
+ %2631 = insertelement <4 x float> %2630, float %2156, i64 1
+ %2632 = insertelement <4 x float> %2631, float %2157, i64 2
+ %2633 = insertelement <4 x float> %2632, float %2158, i64 3
+ %2634 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2596, <4 x float> %2633, i32 0, i32 0, i32 0)
+ %2635 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2597, <4 x float> %2634, i32 0, i32 0, i32 0)
+ %2636 = insertelement <4 x float> poison, float %2159, i64 0
+ %2637 = insertelement <4 x float> %2636, float %2160, i64 1
+ %2638 = insertelement <4 x float> %2637, float %2161, i64 2
+ %2639 = insertelement <4 x float> %2638, float %2162, i64 3
+ %2640 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2342, <4 x half> %2598, <4 x float> %2639, i32 0, i32 0, i32 0)
+ %2641 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2343, <4 x half> %2599, <4 x float> %2640, i32 0, i32 0, i32 0)
+ %2642 = insertelement <4 x float> poison, float %2163, i64 0
+ %2643 = insertelement <4 x float> %2642, float %2164, i64 1
+ %2644 = insertelement <4 x float> %2643, float %2165, i64 2
+ %2645 = insertelement <4 x float> %2644, float %2166, i64 3
+ %2646 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2344, <4 x half> %2598, <4 x float> %2645, i32 0, i32 0, i32 0)
+ %2647 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2345, <4 x half> %2599, <4 x float> %2646, i32 0, i32 0, i32 0)
+ %2648 = insertelement <4 x float> poison, float %2103, i64 0
+ %2649 = insertelement <4 x float> %2648, float %2104, i64 1
+ %2650 = insertelement <4 x float> %2649, float %2105, i64 2
+ %2651 = insertelement <4 x float> %2650, float %2106, i64 3
+ %2652 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2540, <4 x float> %2651, i32 0, i32 0, i32 0)
+ %2653 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2541, <4 x float> %2652, i32 0, i32 0, i32 0)
+ %2654 = insertelement <4 x float> poison, float %2107, i64 0
+ %2655 = insertelement <4 x float> %2654, float %2108, i64 1
+ %2656 = insertelement <4 x float> %2655, float %2109, i64 2
+ %2657 = insertelement <4 x float> %2656, float %2110, i64 3
+ %2658 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2540, <4 x float> %2657, i32 0, i32 0, i32 0)
+ %2659 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2541, <4 x float> %2658, i32 0, i32 0, i32 0)
+ %2660 = insertelement <4 x float> poison, float %2111, i64 0
+ %2661 = insertelement <4 x float> %2660, float %2112, i64 1
+ %2662 = insertelement <4 x float> %2661, float %2113, i64 2
+ %2663 = insertelement <4 x float> %2662, float %2114, i64 3
+ %2664 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2542, <4 x float> %2663, i32 0, i32 0, i32 0)
+ %2665 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2543, <4 x float> %2664, i32 0, i32 0, i32 0)
+ %2666 = insertelement <4 x float> poison, float %2115, i64 0
+ %2667 = insertelement <4 x float> %2666, float %2116, i64 1
+ %2668 = insertelement <4 x float> %2667, float %2117, i64 2
+ %2669 = insertelement <4 x float> %2668, float %2118, i64 3
+ %2670 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2542, <4 x float> %2669, i32 0, i32 0, i32 0)
+ %2671 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2543, <4 x float> %2670, i32 0, i32 0, i32 0)
+ %2672 = insertelement <4 x float> poison, float %2119, i64 0
+ %2673 = insertelement <4 x float> %2672, float %2120, i64 1
+ %2674 = insertelement <4 x float> %2673, float %2121, i64 2
+ %2675 = insertelement <4 x float> %2674, float %2122, i64 3
+ %2676 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2540, <4 x float> %2675, i32 0, i32 0, i32 0)
+ %2677 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2541, <4 x float> %2676, i32 0, i32 0, i32 0)
+ %2678 = insertelement <4 x float> poison, float %2123, i64 0
+ %2679 = insertelement <4 x float> %2678, float %2124, i64 1
+ %2680 = insertelement <4 x float> %2679, float %2125, i64 2
+ %2681 = insertelement <4 x float> %2680, float %2126, i64 3
+ %2682 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2540, <4 x float> %2681, i32 0, i32 0, i32 0)
+ %2683 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2541, <4 x float> %2682, i32 0, i32 0, i32 0)
+ %2684 = insertelement <4 x float> poison, float %2127, i64 0
+ %2685 = insertelement <4 x float> %2684, float %2128, i64 1
+ %2686 = insertelement <4 x float> %2685, float %2129, i64 2
+ %2687 = insertelement <4 x float> %2686, float %2130, i64 3
+ %2688 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2542, <4 x float> %2687, i32 0, i32 0, i32 0)
+ %2689 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2543, <4 x float> %2688, i32 0, i32 0, i32 0)
+ %2690 = insertelement <4 x float> poison, float %2131, i64 0
+ %2691 = insertelement <4 x float> %2690, float %2132, i64 1
+ %2692 = insertelement <4 x float> %2691, float %2133, i64 2
+ %2693 = insertelement <4 x float> %2692, float %2134, i64 3
+ %2694 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2542, <4 x float> %2693, i32 0, i32 0, i32 0)
+ %2695 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2543, <4 x float> %2694, i32 0, i32 0, i32 0)
+ %2696 = insertelement <4 x float> poison, float %2167, i64 0
+ %2697 = insertelement <4 x float> %2696, float %2168, i64 1
+ %2698 = insertelement <4 x float> %2697, float %2169, i64 2
+ %2699 = insertelement <4 x float> %2698, float %2170, i64 3
+ %2700 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2596, <4 x float> %2699, i32 0, i32 0, i32 0)
+ %2701 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2597, <4 x float> %2700, i32 0, i32 0, i32 0)
+ %2702 = insertelement <4 x float> poison, float %2171, i64 0
+ %2703 = insertelement <4 x float> %2702, float %2172, i64 1
+ %2704 = insertelement <4 x float> %2703, float %2173, i64 2
+ %2705 = insertelement <4 x float> %2704, float %2174, i64 3
+ %2706 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2596, <4 x float> %2705, i32 0, i32 0, i32 0)
+ %2707 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2597, <4 x float> %2706, i32 0, i32 0, i32 0)
+ %2708 = insertelement <4 x float> poison, float %2175, i64 0
+ %2709 = insertelement <4 x float> %2708, float %2176, i64 1
+ %2710 = insertelement <4 x float> %2709, float %2177, i64 2
+ %2711 = insertelement <4 x float> %2710, float %2178, i64 3
+ %2712 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2427, <4 x half> %2598, <4 x float> %2711, i32 0, i32 0, i32 0)
+ %2713 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2428, <4 x half> %2599, <4 x float> %2712, i32 0, i32 0, i32 0)
+ %2714 = insertelement <4 x float> poison, float %2179, i64 0
+ %2715 = insertelement <4 x float> %2714, float %2180, i64 1
+ %2716 = insertelement <4 x float> %2715, float %2181, i64 2
+ %2717 = insertelement <4 x float> %2716, float %2182, i64 3
+ %2718 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2429, <4 x half> %2598, <4 x float> %2717, i32 0, i32 0, i32 0)
+ %2719 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2430, <4 x half> %2599, <4 x float> %2718, i32 0, i32 0, i32 0)
+ %2720 = insertelement <4 x float> poison, float %2183, i64 0
+ %2721 = insertelement <4 x float> %2720, float %2184, i64 1
+ %2722 = insertelement <4 x float> %2721, float %2185, i64 2
+ %2723 = insertelement <4 x float> %2722, float %2186, i64 3
+ %2724 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2596, <4 x float> %2723, i32 0, i32 0, i32 0)
+ %2725 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2597, <4 x float> %2724, i32 0, i32 0, i32 0)
+ %2726 = insertelement <4 x float> poison, float %2187, i64 0
+ %2727 = insertelement <4 x float> %2726, float %2188, i64 1
+ %2728 = insertelement <4 x float> %2727, float %2189, i64 2
+ %2729 = insertelement <4 x float> %2728, float %2190, i64 3
+ %2730 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2596, <4 x float> %2729, i32 0, i32 0, i32 0)
+ %2731 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2597, <4 x float> %2730, i32 0, i32 0, i32 0)
+ %2732 = insertelement <4 x float> poison, float %2191, i64 0
+ %2733 = insertelement <4 x float> %2732, float %2192, i64 1
+ %2734 = insertelement <4 x float> %2733, float %2193, i64 2
+ %2735 = insertelement <4 x float> %2734, float %2194, i64 3
+ %2736 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2460, <4 x half> %2598, <4 x float> %2735, i32 0, i32 0, i32 0)
+ %2737 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2461, <4 x half> %2599, <4 x float> %2736, i32 0, i32 0, i32 0)
+ %2738 = insertelement <4 x float> poison, float %2195, i64 0
+ %2739 = insertelement <4 x float> %2738, float %2196, i64 1
+ %2740 = insertelement <4 x float> %2739, float %2197, i64 2
+ %2741 = insertelement <4 x float> %2740, float %2198, i64 3
+ %2742 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2462, <4 x half> %2598, <4 x float> %2741, i32 0, i32 0, i32 0)
+ %2743 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2463, <4 x half> %2599, <4 x float> %2742, i32 0, i32 0, i32 0)
+ %2744 = or disjoint i32 %.pre-phi1042, 2048
+ %2745 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1042
+ %2746 = load <8 x half>, ptr addrspace(3) %2745, align 16
+ %2747 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2744
+ %2748 = load <8 x half>, ptr addrspace(3) %2747, align 16
+ %2749 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1044
+ %2750 = load <8 x half>, ptr addrspace(3) %2749, align 16
+ %2751 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1046
+ %2752 = load <8 x half>, ptr addrspace(3) %2751, align 16
+ %2753 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2754 = shufflevector <8 x half> %2750, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2755 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2756 = shufflevector <8 x half> %2752, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2757 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2758 = shufflevector <8 x half> %2746, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2759 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2760 = shufflevector <8 x half> %2748, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2761 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2753, <4 x float> %2323, i32 0, i32 0, i32 0)
+ %2762 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2754, <4 x float> %2761, i32 0, i32 0, i32 0)
+ %2763 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2753, <4 x float> %2329, i32 0, i32 0, i32 0)
+ %2764 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2754, <4 x float> %2763, i32 0, i32 0, i32 0)
+ %2765 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2755, <4 x float> %2335, i32 0, i32 0, i32 0)
+ %2766 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2756, <4 x float> %2765, i32 0, i32 0, i32 0)
+ %2767 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2755, <4 x float> %2341, i32 0, i32 0, i32 0)
+ %2768 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2756, <4 x float> %2767, i32 0, i32 0, i32 0)
+ %2769 = or disjoint i32 %.pre-phi1048, 2048
+ %2770 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1048
+ %2771 = load <8 x half>, ptr addrspace(3) %2770, align 16
+ %2772 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %2769
+ %2773 = load <8 x half>, ptr addrspace(3) %2772, align 16
+ %2774 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2775 = shufflevector <8 x half> %2771, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2776 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2777 = shufflevector <8 x half> %2773, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2778 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2753, <4 x float> %2351, i32 0, i32 0, i32 0)
+ %2779 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2754, <4 x float> %2778, i32 0, i32 0, i32 0)
+ %2780 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2753, <4 x float> %2357, i32 0, i32 0, i32 0)
+ %2781 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2754, <4 x float> %2780, i32 0, i32 0, i32 0)
+ %2782 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2755, <4 x float> %2363, i32 0, i32 0, i32 0)
+ %2783 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2756, <4 x float> %2782, i32 0, i32 0, i32 0)
+ %2784 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2755, <4 x float> %2369, i32 0, i32 0, i32 0)
+ %2785 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2756, <4 x float> %2784, i32 0, i32 0, i32 0)
+ %2786 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1050
+ %2787 = load <8 x half>, ptr addrspace(3) %2786, align 16
+ %2788 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1052
+ %2789 = load <8 x half>, ptr addrspace(3) %2788, align 16
+ %2790 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2791 = shufflevector <8 x half> %2787, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2792 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2793 = shufflevector <8 x half> %2789, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2794 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2790, <4 x float> %2379, i32 0, i32 0, i32 0)
+ %2795 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2791, <4 x float> %2794, i32 0, i32 0, i32 0)
+ %2796 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2790, <4 x float> %2385, i32 0, i32 0, i32 0)
+ %2797 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2791, <4 x float> %2796, i32 0, i32 0, i32 0)
+ %2798 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2792, <4 x float> %2391, i32 0, i32 0, i32 0)
+ %2799 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2793, <4 x float> %2798, i32 0, i32 0, i32 0)
+ %2800 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2792, <4 x float> %2397, i32 0, i32 0, i32 0)
+ %2801 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2793, <4 x float> %2800, i32 0, i32 0, i32 0)
+ %2802 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2790, <4 x float> %2403, i32 0, i32 0, i32 0)
+ %2803 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2791, <4 x float> %2802, i32 0, i32 0, i32 0)
+ %2804 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2790, <4 x float> %2409, i32 0, i32 0, i32 0)
+ %2805 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2791, <4 x float> %2804, i32 0, i32 0, i32 0)
+ %2806 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2792, <4 x float> %2415, i32 0, i32 0, i32 0)
+ %2807 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2793, <4 x float> %2806, i32 0, i32 0, i32 0)
+ %2808 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2792, <4 x float> %2421, i32 0, i32 0, i32 0)
+ %2809 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2793, <4 x float> %2808, i32 0, i32 0, i32 0)
+ %2810 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1054
+ %2811 = load <8 x half>, ptr addrspace(3) %2810, align 16
+ %2812 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1056
+ %2813 = load <8 x half>, ptr addrspace(3) %2812, align 16
+ %2814 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2815 = shufflevector <8 x half> %2811, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2816 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2817 = shufflevector <8 x half> %2813, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2818 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2753, <4 x float> %2436, i32 0, i32 0, i32 0)
+ %2819 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2754, <4 x float> %2818, i32 0, i32 0, i32 0)
+ %2820 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2753, <4 x float> %2442, i32 0, i32 0, i32 0)
+ %2821 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2754, <4 x float> %2820, i32 0, i32 0, i32 0)
+ %2822 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2755, <4 x float> %2448, i32 0, i32 0, i32 0)
+ %2823 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2756, <4 x float> %2822, i32 0, i32 0, i32 0)
+ %2824 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2755, <4 x float> %2454, i32 0, i32 0, i32 0)
+ %2825 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2756, <4 x float> %2824, i32 0, i32 0, i32 0)
+ %2826 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1058
+ %2827 = load <8 x half>, ptr addrspace(3) %2826, align 16
+ %2828 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.pre-phi1060
+ %2829 = load <8 x half>, ptr addrspace(3) %2828, align 16
+ %2830 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2831 = shufflevector <8 x half> %2827, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2832 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2833 = shufflevector <8 x half> %2829, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2834 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2753, <4 x float> %2469, i32 0, i32 0, i32 0)
+ %2835 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2754, <4 x float> %2834, i32 0, i32 0, i32 0)
+ %2836 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2753, <4 x float> %2475, i32 0, i32 0, i32 0)
+ %2837 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2754, <4 x float> %2836, i32 0, i32 0, i32 0)
+ %2838 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2755, <4 x float> %2481, i32 0, i32 0, i32 0)
+ %2839 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2756, <4 x float> %2838, i32 0, i32 0, i32 0)
+ %2840 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2755, <4 x float> %2487, i32 0, i32 0, i32 0)
+ %2841 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2756, <4 x float> %2840, i32 0, i32 0, i32 0)
+ %2842 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2790, <4 x float> %2493, i32 0, i32 0, i32 0)
+ %2843 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2791, <4 x float> %2842, i32 0, i32 0, i32 0)
+ %2844 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2790, <4 x float> %2499, i32 0, i32 0, i32 0)
+ %2845 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2791, <4 x float> %2844, i32 0, i32 0, i32 0)
+ %2846 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2792, <4 x float> %2505, i32 0, i32 0, i32 0)
+ %2847 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2793, <4 x float> %2846, i32 0, i32 0, i32 0)
+ %2848 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2792, <4 x float> %2511, i32 0, i32 0, i32 0)
+ %2849 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2793, <4 x float> %2848, i32 0, i32 0, i32 0)
+ %2850 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2790, <4 x float> %2517, i32 0, i32 0, i32 0)
+ %2851 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2791, <4 x float> %2850, i32 0, i32 0, i32 0)
+ %2852 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2790, <4 x float> %2523, i32 0, i32 0, i32 0)
+ %2853 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2791, <4 x float> %2852, i32 0, i32 0, i32 0)
+ %2854 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2792, <4 x float> %2529, i32 0, i32 0, i32 0)
+ %2855 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2793, <4 x float> %2854, i32 0, i32 0, i32 0)
+ %2856 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2792, <4 x float> %2535, i32 0, i32 0, i32 0)
+ %2857 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2793, <4 x float> %2856, i32 0, i32 0, i32 0)
+ %2858 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1062
+ %2859 = load <8 x half>, ptr addrspace(3) %2858, align 16
+ %2860 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1064
+ %2861 = load <8 x half>, ptr addrspace(3) %2860, align 16
+ %2862 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2863 = shufflevector <8 x half> %2859, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2864 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2865 = shufflevector <8 x half> %2861, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2866 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2862, <4 x float> %2549, i32 0, i32 0, i32 0)
+ %2867 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2863, <4 x float> %2866, i32 0, i32 0, i32 0)
+ %2868 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2862, <4 x float> %2555, i32 0, i32 0, i32 0)
+ %2869 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2863, <4 x float> %2868, i32 0, i32 0, i32 0)
+ %2870 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2864, <4 x float> %2561, i32 0, i32 0, i32 0)
+ %2871 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2865, <4 x float> %2870, i32 0, i32 0, i32 0)
+ %2872 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2864, <4 x float> %2567, i32 0, i32 0, i32 0)
+ %2873 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2865, <4 x float> %2872, i32 0, i32 0, i32 0)
+ %2874 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2862, <4 x float> %2573, i32 0, i32 0, i32 0)
+ %2875 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2863, <4 x float> %2874, i32 0, i32 0, i32 0)
+ %2876 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2862, <4 x float> %2579, i32 0, i32 0, i32 0)
+ %2877 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2863, <4 x float> %2876, i32 0, i32 0, i32 0)
+ %2878 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2864, <4 x float> %2585, i32 0, i32 0, i32 0)
+ %2879 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2865, <4 x float> %2878, i32 0, i32 0, i32 0)
+ %2880 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2864, <4 x float> %2591, i32 0, i32 0, i32 0)
+ %2881 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2865, <4 x float> %2880, i32 0, i32 0, i32 0)
+ %2882 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1066
+ %2883 = load <8 x half>, ptr addrspace(3) %2882, align 16
+ %2884 = getelementptr half, ptr addrspace(3) @global_smem, i32 %.pre-phi1068
+ %2885 = load <8 x half>, ptr addrspace(3) %2884, align 16
+ %2886 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2887 = shufflevector <8 x half> %2883, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2888 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2889 = shufflevector <8 x half> %2885, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2890 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2886, <4 x float> %2605, i32 0, i32 0, i32 0)
+ %2891 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2887, <4 x float> %2890, i32 0, i32 0, i32 0)
+ %2892 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2886, <4 x float> %2611, i32 0, i32 0, i32 0)
+ %2893 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2887, <4 x float> %2892, i32 0, i32 0, i32 0)
+ %2894 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2757, <4 x half> %2888, <4 x float> %2617, i32 0, i32 0, i32 0)
+ %2895 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2758, <4 x half> %2889, <4 x float> %2894, i32 0, i32 0, i32 0)
+ %2896 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2759, <4 x half> %2888, <4 x float> %2623, i32 0, i32 0, i32 0)
+ %2897 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2760, <4 x half> %2889, <4 x float> %2896, i32 0, i32 0, i32 0)
+ %2898 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2886, <4 x float> %2629, i32 0, i32 0, i32 0)
+ %2899 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2887, <4 x float> %2898, i32 0, i32 0, i32 0)
+ %2900 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2886, <4 x float> %2635, i32 0, i32 0, i32 0)
+ %2901 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2887, <4 x float> %2900, i32 0, i32 0, i32 0)
+ %2902 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2774, <4 x half> %2888, <4 x float> %2641, i32 0, i32 0, i32 0)
+ %2903 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2775, <4 x half> %2889, <4 x float> %2902, i32 0, i32 0, i32 0)
+ %2904 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2776, <4 x half> %2888, <4 x float> %2647, i32 0, i32 0, i32 0)
+ %2905 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2777, <4 x half> %2889, <4 x float> %2904, i32 0, i32 0, i32 0)
+ %2906 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2862, <4 x float> %2653, i32 0, i32 0, i32 0)
+ %2907 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2863, <4 x float> %2906, i32 0, i32 0, i32 0)
+ %2908 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2862, <4 x float> %2659, i32 0, i32 0, i32 0)
+ %2909 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2863, <4 x float> %2908, i32 0, i32 0, i32 0)
+ %2910 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2864, <4 x float> %2665, i32 0, i32 0, i32 0)
+ %2911 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2865, <4 x float> %2910, i32 0, i32 0, i32 0)
+ %2912 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2864, <4 x float> %2671, i32 0, i32 0, i32 0)
+ %2913 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2865, <4 x float> %2912, i32 0, i32 0, i32 0)
+ %2914 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2862, <4 x float> %2677, i32 0, i32 0, i32 0)
+ %2915 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2863, <4 x float> %2914, i32 0, i32 0, i32 0)
+ %2916 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2862, <4 x float> %2683, i32 0, i32 0, i32 0)
+ %2917 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2863, <4 x float> %2916, i32 0, i32 0, i32 0)
+ %2918 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2864, <4 x float> %2689, i32 0, i32 0, i32 0)
+ %2919 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2865, <4 x float> %2918, i32 0, i32 0, i32 0)
+ %2920 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2864, <4 x float> %2695, i32 0, i32 0, i32 0)
+ %2921 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2865, <4 x float> %2920, i32 0, i32 0, i32 0)
+ %2922 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2886, <4 x float> %2701, i32 0, i32 0, i32 0)
+ %2923 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2887, <4 x float> %2922, i32 0, i32 0, i32 0)
+ %2924 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2886, <4 x float> %2707, i32 0, i32 0, i32 0)
+ %2925 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2887, <4 x float> %2924, i32 0, i32 0, i32 0)
+ %2926 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2814, <4 x half> %2888, <4 x float> %2713, i32 0, i32 0, i32 0)
+ %2927 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2815, <4 x half> %2889, <4 x float> %2926, i32 0, i32 0, i32 0)
+ %2928 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2816, <4 x half> %2888, <4 x float> %2719, i32 0, i32 0, i32 0)
+ %2929 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2817, <4 x half> %2889, <4 x float> %2928, i32 0, i32 0, i32 0)
+ %2930 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2886, <4 x float> %2725, i32 0, i32 0, i32 0)
+ %2931 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2887, <4 x float> %2930, i32 0, i32 0, i32 0)
+ %2932 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2886, <4 x float> %2731, i32 0, i32 0, i32 0)
+ %2933 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2887, <4 x float> %2932, i32 0, i32 0, i32 0)
+ %2934 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2830, <4 x half> %2888, <4 x float> %2737, i32 0, i32 0, i32 0)
+ %2935 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2831, <4 x half> %2889, <4 x float> %2934, i32 0, i32 0, i32 0)
+ %2936 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2832, <4 x half> %2888, <4 x float> %2743, i32 0, i32 0, i32 0)
+ %2937 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %2833, <4 x half> %2889, <4 x float> %2936, i32 0, i32 0, i32 0)
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %2938 = shufflevector <2 x half> %2199, <2 x half> %2263, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2939 = shufflevector <2 x half> %2264, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2940 = shufflevector <8 x half> %2938, <8 x half> %2939, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2941 = shufflevector <2 x half> %2200, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2942 = shufflevector <8 x half> %2940, <8 x half> %2941, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2942, ptr addrspace(3) %199, align 16
+ %2943 = shufflevector <2 x half> %2201, <2 x half> %2265, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2944 = shufflevector <2 x half> %2266, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2945 = shufflevector <8 x half> %2943, <8 x half> %2944, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2946 = shufflevector <2 x half> %2202, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2947 = shufflevector <8 x half> %2945, <8 x half> %2946, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2947, ptr addrspace(3) %201, align 16
+ %2948 = shufflevector <2 x half> %2203, <2 x half> %2267, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2949 = shufflevector <2 x half> %2268, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2950 = shufflevector <8 x half> %2948, <8 x half> %2949, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2951 = shufflevector <2 x half> %2204, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2952 = shufflevector <8 x half> %2950, <8 x half> %2951, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2952, ptr addrspace(3) %203, align 16
+ %2953 = shufflevector <2 x half> %2205, <2 x half> %2269, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2954 = shufflevector <2 x half> %2270, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2955 = shufflevector <8 x half> %2953, <8 x half> %2954, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2956 = shufflevector <2 x half> %2206, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2957 = shufflevector <8 x half> %2955, <8 x half> %2956, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2957, ptr addrspace(3) %205, align 16
+ %2958 = shufflevector <2 x half> %2207, <2 x half> %2271, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2959 = shufflevector <2 x half> %2272, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2960 = shufflevector <8 x half> %2958, <8 x half> %2959, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2961 = shufflevector <2 x half> %2208, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2962 = shufflevector <8 x half> %2960, <8 x half> %2961, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2962, ptr addrspace(3) %207, align 16
+ %2963 = shufflevector <2 x half> %2209, <2 x half> %2273, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2964 = shufflevector <2 x half> %2274, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2965 = shufflevector <8 x half> %2963, <8 x half> %2964, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2966 = shufflevector <2 x half> %2210, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2967 = shufflevector <8 x half> %2965, <8 x half> %2966, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2967, ptr addrspace(3) %209, align 16
+ %2968 = shufflevector <2 x half> %2211, <2 x half> %2275, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2969 = shufflevector <2 x half> %2276, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2970 = shufflevector <8 x half> %2968, <8 x half> %2969, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2971 = shufflevector <2 x half> %2212, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2972 = shufflevector <8 x half> %2970, <8 x half> %2971, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2972, ptr addrspace(3) %211, align 16
+ %2973 = shufflevector <2 x half> %2213, <2 x half> %2277, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2974 = shufflevector <2 x half> %2278, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2975 = shufflevector <8 x half> %2973, <8 x half> %2974, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2976 = shufflevector <2 x half> %2214, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2977 = shufflevector <8 x half> %2975, <8 x half> %2976, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2977, ptr addrspace(3) %213, align 16
+ %2978 = shufflevector <2 x half> %2215, <2 x half> %2279, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2979 = shufflevector <2 x half> %2280, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2980 = shufflevector <8 x half> %2978, <8 x half> %2979, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2981 = shufflevector <2 x half> %2216, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2982 = shufflevector <8 x half> %2980, <8 x half> %2981, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2982, ptr addrspace(3) %214, align 16
+ %2983 = shufflevector <2 x half> %2217, <2 x half> %2281, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2984 = shufflevector <2 x half> %2282, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2985 = shufflevector <8 x half> %2983, <8 x half> %2984, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2986 = shufflevector <2 x half> %2218, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2987 = shufflevector <8 x half> %2985, <8 x half> %2986, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2987, ptr addrspace(3) %215, align 16
+ %2988 = shufflevector <2 x half> %2219, <2 x half> %2283, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2989 = shufflevector <2 x half> %2284, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2990 = shufflevector <8 x half> %2988, <8 x half> %2989, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2991 = shufflevector <2 x half> %2220, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2992 = shufflevector <8 x half> %2990, <8 x half> %2991, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2992, ptr addrspace(3) %216, align 16
+ %2993 = shufflevector <2 x half> %2221, <2 x half> %2285, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2994 = shufflevector <2 x half> %2286, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2995 = shufflevector <8 x half> %2993, <8 x half> %2994, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %2996 = shufflevector <2 x half> %2222, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2997 = shufflevector <8 x half> %2995, <8 x half> %2996, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %2997, ptr addrspace(3) %217, align 16
+ %2998 = shufflevector <2 x half> %2223, <2 x half> %2287, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %2999 = shufflevector <2 x half> %2288, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3000 = shufflevector <8 x half> %2998, <8 x half> %2999, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3001 = shufflevector <2 x half> %2224, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3002 = shufflevector <8 x half> %3000, <8 x half> %3001, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3002, ptr addrspace(3) %218, align 16
+ %3003 = shufflevector <2 x half> %2225, <2 x half> %2289, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3004 = shufflevector <2 x half> %2290, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3005 = shufflevector <8 x half> %3003, <8 x half> %3004, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3006 = shufflevector <2 x half> %2226, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3007 = shufflevector <8 x half> %3005, <8 x half> %3006, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3007, ptr addrspace(3) %219, align 16
+ %3008 = shufflevector <2 x half> %2227, <2 x half> %2291, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3009 = shufflevector <2 x half> %2292, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3010 = shufflevector <8 x half> %3008, <8 x half> %3009, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3011 = shufflevector <2 x half> %2228, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3012 = shufflevector <8 x half> %3010, <8 x half> %3011, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3012, ptr addrspace(3) %220, align 16
+ %3013 = shufflevector <2 x half> %2229, <2 x half> %2293, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3014 = shufflevector <2 x half> %2294, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3015 = shufflevector <8 x half> %3013, <8 x half> %3014, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+ %3016 = shufflevector <2 x half> %2230, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %3017 = shufflevector <8 x half> %3015, <8 x half> %3016, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x half> %3017, ptr addrspace(3) %221, align 16
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %3018 = load <8 x half>, ptr addrspace(3) %243, align 16
+ %3019 = load <8 x half>, ptr addrspace(3) %245, align 16
+ %3020 = load <8 x half>, ptr addrspace(3) %233, align 16
+ %3021 = load <8 x half>, ptr addrspace(3) %235, align 16
+ %3022 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3023 = shufflevector <8 x half> %3020, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3024 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3025 = shufflevector <8 x half> %3021, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3026 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3027 = shufflevector <8 x half> %3018, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3028 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3029 = shufflevector <8 x half> %3019, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3030 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3022, <4 x float> %2762, i32 0, i32 0, i32 0)
+ %3031 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3023, <4 x float> %3030, i32 0, i32 0, i32 0)
+ %3032 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3022, <4 x float> %2764, i32 0, i32 0, i32 0)
+ %3033 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3023, <4 x float> %3032, i32 0, i32 0, i32 0)
+ %3034 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3024, <4 x float> %2766, i32 0, i32 0, i32 0)
+ %3035 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3025, <4 x float> %3034, i32 0, i32 0, i32 0)
+ %3036 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3024, <4 x float> %2768, i32 0, i32 0, i32 0)
+ %3037 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3025, <4 x float> %3036, i32 0, i32 0, i32 0)
+ %3038 = load <8 x half>, ptr addrspace(3) %258, align 16
+ %3039 = load <8 x half>, ptr addrspace(3) %260, align 16
+ %3040 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3041 = shufflevector <8 x half> %3038, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3042 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3043 = shufflevector <8 x half> %3039, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3044 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3022, <4 x float> %2779, i32 0, i32 0, i32 0)
+ %3045 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3023, <4 x float> %3044, i32 0, i32 0, i32 0)
+ %3046 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3022, <4 x float> %2781, i32 0, i32 0, i32 0)
+ %3047 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3023, <4 x float> %3046, i32 0, i32 0, i32 0)
+ %3048 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3024, <4 x float> %2783, i32 0, i32 0, i32 0)
+ %3049 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3025, <4 x float> %3048, i32 0, i32 0, i32 0)
+ %3050 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3024, <4 x float> %2785, i32 0, i32 0, i32 0)
+ %3051 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3025, <4 x float> %3050, i32 0, i32 0, i32 0)
+ %3052 = load <8 x half>, ptr addrspace(3) %2423, align 16
+ %3053 = load <8 x half>, ptr addrspace(3) %2425, align 16
+ %3054 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3055 = shufflevector <8 x half> %3052, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3056 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3057 = shufflevector <8 x half> %3053, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3058 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3022, <4 x float> %2819, i32 0, i32 0, i32 0)
+ %3059 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3023, <4 x float> %3058, i32 0, i32 0, i32 0)
+ %3060 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3022, <4 x float> %2821, i32 0, i32 0, i32 0)
+ %3061 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3023, <4 x float> %3060, i32 0, i32 0, i32 0)
+ %3062 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3024, <4 x float> %2823, i32 0, i32 0, i32 0)
+ %3063 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3025, <4 x float> %3062, i32 0, i32 0, i32 0)
+ %3064 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3024, <4 x float> %2825, i32 0, i32 0, i32 0)
+ %3065 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3025, <4 x float> %3064, i32 0, i32 0, i32 0)
+ %3066 = load <8 x half>, ptr addrspace(3) %2456, align 16
+ %3067 = load <8 x half>, ptr addrspace(3) %2458, align 16
+ %3068 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3069 = shufflevector <8 x half> %3066, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3070 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3071 = shufflevector <8 x half> %3067, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3072 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3022, <4 x float> %2835, i32 0, i32 0, i32 0)
+ %3073 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3023, <4 x float> %3072, i32 0, i32 0, i32 0)
+ %3074 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3022, <4 x float> %2837, i32 0, i32 0, i32 0)
+ %3075 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3023, <4 x float> %3074, i32 0, i32 0, i32 0)
+ %3076 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3024, <4 x float> %2839, i32 0, i32 0, i32 0)
+ %3077 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3025, <4 x float> %3076, i32 0, i32 0, i32 0)
+ %3078 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3024, <4 x float> %2841, i32 0, i32 0, i32 0)
+ %3079 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3025, <4 x float> %3078, i32 0, i32 0, i32 0)
+ %3080 = load <8 x half>, ptr addrspace(3) %251, align 16
+ %3081 = load <8 x half>, ptr addrspace(3) %253, align 16
+ %3082 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3083 = shufflevector <8 x half> %3080, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3084 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3085 = shufflevector <8 x half> %3081, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3086 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3082, <4 x float> %2795, i32 0, i32 0, i32 0)
+ %3087 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3083, <4 x float> %3086, i32 0, i32 0, i32 0)
+ %3088 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3082, <4 x float> %2797, i32 0, i32 0, i32 0)
+ %3089 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3083, <4 x float> %3088, i32 0, i32 0, i32 0)
+ %3090 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3084, <4 x float> %2799, i32 0, i32 0, i32 0)
+ %3091 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3085, <4 x float> %3090, i32 0, i32 0, i32 0)
+ %3092 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3084, <4 x float> %2801, i32 0, i32 0, i32 0)
+ %3093 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3085, <4 x float> %3092, i32 0, i32 0, i32 0)
+ %3094 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3082, <4 x float> %2803, i32 0, i32 0, i32 0)
+ %3095 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3083, <4 x float> %3094, i32 0, i32 0, i32 0)
+ %3096 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3082, <4 x float> %2805, i32 0, i32 0, i32 0)
+ %3097 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3083, <4 x float> %3096, i32 0, i32 0, i32 0)
+ %3098 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3084, <4 x float> %2807, i32 0, i32 0, i32 0)
+ %3099 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3085, <4 x float> %3098, i32 0, i32 0, i32 0)
+ %3100 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3084, <4 x float> %2809, i32 0, i32 0, i32 0)
+ %3101 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3085, <4 x float> %3100, i32 0, i32 0, i32 0)
+ %3102 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3082, <4 x float> %2843, i32 0, i32 0, i32 0)
+ %3103 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3083, <4 x float> %3102, i32 0, i32 0, i32 0)
+ %3104 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3082, <4 x float> %2845, i32 0, i32 0, i32 0)
+ %3105 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3083, <4 x float> %3104, i32 0, i32 0, i32 0)
+ %3106 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3084, <4 x float> %2847, i32 0, i32 0, i32 0)
+ %3107 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3085, <4 x float> %3106, i32 0, i32 0, i32 0)
+ %3108 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3084, <4 x float> %2849, i32 0, i32 0, i32 0)
+ %3109 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3085, <4 x float> %3108, i32 0, i32 0, i32 0)
+ %3110 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3082, <4 x float> %2851, i32 0, i32 0, i32 0)
+ %3111 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3083, <4 x float> %3110, i32 0, i32 0, i32 0)
+ %3112 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3082, <4 x float> %2853, i32 0, i32 0, i32 0)
+ %3113 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3083, <4 x float> %3112, i32 0, i32 0, i32 0)
+ %3114 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3084, <4 x float> %2855, i32 0, i32 0, i32 0)
+ %3115 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3085, <4 x float> %3114, i32 0, i32 0, i32 0)
+ %3116 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3084, <4 x float> %2857, i32 0, i32 0, i32 0)
+ %3117 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3085, <4 x float> %3116, i32 0, i32 0, i32 0)
+ %3118 = load <8 x half>, ptr addrspace(3) %2536, align 16
+ %3119 = load <8 x half>, ptr addrspace(3) %2538, align 16
+ %3120 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3121 = shufflevector <8 x half> %3118, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3122 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3123 = shufflevector <8 x half> %3119, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3124 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3120, <4 x float> %2867, i32 0, i32 0, i32 0)
+ %3125 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3121, <4 x float> %3124, i32 0, i32 0, i32 0)
+ %3126 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3120, <4 x float> %2869, i32 0, i32 0, i32 0)
+ %3127 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3121, <4 x float> %3126, i32 0, i32 0, i32 0)
+ %3128 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3122, <4 x float> %2871, i32 0, i32 0, i32 0)
+ %3129 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3123, <4 x float> %3128, i32 0, i32 0, i32 0)
+ %3130 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3122, <4 x float> %2873, i32 0, i32 0, i32 0)
+ %3131 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3123, <4 x float> %3130, i32 0, i32 0, i32 0)
+ %3132 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3120, <4 x float> %2875, i32 0, i32 0, i32 0)
+ %3133 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3121, <4 x float> %3132, i32 0, i32 0, i32 0)
+ %3134 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3120, <4 x float> %2877, i32 0, i32 0, i32 0)
+ %3135 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3121, <4 x float> %3134, i32 0, i32 0, i32 0)
+ %3136 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3122, <4 x float> %2879, i32 0, i32 0, i32 0)
+ %3137 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3123, <4 x float> %3136, i32 0, i32 0, i32 0)
+ %3138 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3122, <4 x float> %2881, i32 0, i32 0, i32 0)
+ %3139 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3123, <4 x float> %3138, i32 0, i32 0, i32 0)
+ %3140 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3120, <4 x float> %2907, i32 0, i32 0, i32 0)
+ %3141 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3121, <4 x float> %3140, i32 0, i32 0, i32 0)
+ %3142 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3120, <4 x float> %2909, i32 0, i32 0, i32 0)
+ %3143 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3121, <4 x float> %3142, i32 0, i32 0, i32 0)
+ %3144 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3122, <4 x float> %2911, i32 0, i32 0, i32 0)
+ %3145 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3123, <4 x float> %3144, i32 0, i32 0, i32 0)
+ %3146 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3122, <4 x float> %2913, i32 0, i32 0, i32 0)
+ %3147 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3123, <4 x float> %3146, i32 0, i32 0, i32 0)
+ %3148 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3120, <4 x float> %2915, i32 0, i32 0, i32 0)
+ %3149 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3121, <4 x float> %3148, i32 0, i32 0, i32 0)
+ %3150 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3120, <4 x float> %2917, i32 0, i32 0, i32 0)
+ %3151 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3121, <4 x float> %3150, i32 0, i32 0, i32 0)
+ %3152 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3122, <4 x float> %2919, i32 0, i32 0, i32 0)
+ %3153 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3123, <4 x float> %3152, i32 0, i32 0, i32 0)
+ %3154 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3122, <4 x float> %2921, i32 0, i32 0, i32 0)
+ %3155 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3123, <4 x float> %3154, i32 0, i32 0, i32 0)
+ %3156 = load <8 x half>, ptr addrspace(3) %2592, align 16
+ %3157 = load <8 x half>, ptr addrspace(3) %2594, align 16
+ %3158 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3159 = shufflevector <8 x half> %3156, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3160 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3161 = shufflevector <8 x half> %3157, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3162 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3158, <4 x float> %2891, i32 0, i32 0, i32 0)
+ %3163 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3159, <4 x float> %3162, i32 0, i32 0, i32 0)
+ %3164 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3158, <4 x float> %2893, i32 0, i32 0, i32 0)
+ %3165 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3159, <4 x float> %3164, i32 0, i32 0, i32 0)
+ %3166 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3026, <4 x half> %3160, <4 x float> %2895, i32 0, i32 0, i32 0)
+ %3167 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3027, <4 x half> %3161, <4 x float> %3166, i32 0, i32 0, i32 0)
+ %3168 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3028, <4 x half> %3160, <4 x float> %2897, i32 0, i32 0, i32 0)
+ %3169 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3029, <4 x half> %3161, <4 x float> %3168, i32 0, i32 0, i32 0)
+ %3170 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3158, <4 x float> %2899, i32 0, i32 0, i32 0)
+ %3171 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3159, <4 x float> %3170, i32 0, i32 0, i32 0)
+ %3172 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3158, <4 x float> %2901, i32 0, i32 0, i32 0)
+ %3173 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3159, <4 x float> %3172, i32 0, i32 0, i32 0)
+ %3174 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3040, <4 x half> %3160, <4 x float> %2903, i32 0, i32 0, i32 0)
+ %3175 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3041, <4 x half> %3161, <4 x float> %3174, i32 0, i32 0, i32 0)
+ %3176 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3042, <4 x half> %3160, <4 x float> %2905, i32 0, i32 0, i32 0)
+ %3177 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3043, <4 x half> %3161, <4 x float> %3176, i32 0, i32 0, i32 0)
+ %3178 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3158, <4 x float> %2923, i32 0, i32 0, i32 0)
+ %3179 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3159, <4 x float> %3178, i32 0, i32 0, i32 0)
+ %3180 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3158, <4 x float> %2925, i32 0, i32 0, i32 0)
+ %3181 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3159, <4 x float> %3180, i32 0, i32 0, i32 0)
+ %3182 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3054, <4 x half> %3160, <4 x float> %2927, i32 0, i32 0, i32 0)
+ %3183 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3055, <4 x half> %3161, <4 x float> %3182, i32 0, i32 0, i32 0)
+ %3184 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3056, <4 x half> %3160, <4 x float> %2929, i32 0, i32 0, i32 0)
+ %3185 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3057, <4 x half> %3161, <4 x float> %3184, i32 0, i32 0, i32 0)
+ %3186 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3158, <4 x float> %2931, i32 0, i32 0, i32 0)
+ %3187 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3159, <4 x float> %3186, i32 0, i32 0, i32 0)
+ %3188 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3158, <4 x float> %2933, i32 0, i32 0, i32 0)
+ %3189 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3159, <4 x float> %3188, i32 0, i32 0, i32 0)
+ %3190 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3068, <4 x half> %3160, <4 x float> %2935, i32 0, i32 0, i32 0)
+ %3191 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3069, <4 x half> %3161, <4 x float> %3190, i32 0, i32 0, i32 0)
+ %3192 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3070, <4 x half> %3160, <4 x float> %2937, i32 0, i32 0, i32 0)
+ %3193 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3071, <4 x half> %3161, <4 x float> %3192, i32 0, i32 0, i32 0)
+ %3194 = load <8 x half>, ptr addrspace(3) %2745, align 16
+ %3195 = load <8 x half>, ptr addrspace(3) %2747, align 16
+ %3196 = load <8 x half>, ptr addrspace(3) %2749, align 16
+ %3197 = load <8 x half>, ptr addrspace(3) %2751, align 16
+ %3198 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3199 = shufflevector <8 x half> %3196, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3200 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3201 = shufflevector <8 x half> %3197, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3202 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3203 = shufflevector <8 x half> %3194, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3204 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3205 = shufflevector <8 x half> %3195, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3206 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3198, <4 x float> %3031, i32 0, i32 0, i32 0)
+ %3207 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3199, <4 x float> %3206, i32 0, i32 0, i32 0)
+ %3208 = extractelement <4 x float> %3207, i64 0
+ %3209 = extractelement <4 x float> %3207, i64 1
+ %3210 = extractelement <4 x float> %3207, i64 2
+ %3211 = extractelement <4 x float> %3207, i64 3
+ %3212 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3198, <4 x float> %3033, i32 0, i32 0, i32 0)
+ %3213 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3199, <4 x float> %3212, i32 0, i32 0, i32 0)
+ %3214 = extractelement <4 x float> %3213, i64 0
+ %3215 = extractelement <4 x float> %3213, i64 1
+ %3216 = extractelement <4 x float> %3213, i64 2
+ %3217 = extractelement <4 x float> %3213, i64 3
+ %3218 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3200, <4 x float> %3035, i32 0, i32 0, i32 0)
+ %3219 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3201, <4 x float> %3218, i32 0, i32 0, i32 0)
+ %3220 = extractelement <4 x float> %3219, i64 0
+ %3221 = extractelement <4 x float> %3219, i64 1
+ %3222 = extractelement <4 x float> %3219, i64 2
+ %3223 = extractelement <4 x float> %3219, i64 3
+ %3224 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3200, <4 x float> %3037, i32 0, i32 0, i32 0)
+ %3225 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3201, <4 x float> %3224, i32 0, i32 0, i32 0)
+ %3226 = extractelement <4 x float> %3225, i64 0
+ %3227 = extractelement <4 x float> %3225, i64 1
+ %3228 = extractelement <4 x float> %3225, i64 2
+ %3229 = extractelement <4 x float> %3225, i64 3
+ %3230 = load <8 x half>, ptr addrspace(3) %2770, align 16
+ %3231 = load <8 x half>, ptr addrspace(3) %2772, align 16
+ %3232 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3233 = shufflevector <8 x half> %3230, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3234 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3235 = shufflevector <8 x half> %3231, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3236 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3198, <4 x float> %3045, i32 0, i32 0, i32 0)
+ %3237 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3199, <4 x float> %3236, i32 0, i32 0, i32 0)
+ %3238 = extractelement <4 x float> %3237, i64 0
+ %3239 = extractelement <4 x float> %3237, i64 1
+ %3240 = extractelement <4 x float> %3237, i64 2
+ %3241 = extractelement <4 x float> %3237, i64 3
+ %3242 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3198, <4 x float> %3047, i32 0, i32 0, i32 0)
+ %3243 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3199, <4 x float> %3242, i32 0, i32 0, i32 0)
+ %3244 = extractelement <4 x float> %3243, i64 0
+ %3245 = extractelement <4 x float> %3243, i64 1
+ %3246 = extractelement <4 x float> %3243, i64 2
+ %3247 = extractelement <4 x float> %3243, i64 3
+ %3248 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3200, <4 x float> %3049, i32 0, i32 0, i32 0)
+ %3249 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3201, <4 x float> %3248, i32 0, i32 0, i32 0)
+ %3250 = extractelement <4 x float> %3249, i64 0
+ %3251 = extractelement <4 x float> %3249, i64 1
+ %3252 = extractelement <4 x float> %3249, i64 2
+ %3253 = extractelement <4 x float> %3249, i64 3
+ %3254 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3200, <4 x float> %3051, i32 0, i32 0, i32 0)
+ %3255 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3201, <4 x float> %3254, i32 0, i32 0, i32 0)
+ %3256 = extractelement <4 x float> %3255, i64 0
+ %3257 = extractelement <4 x float> %3255, i64 1
+ %3258 = extractelement <4 x float> %3255, i64 2
+ %3259 = extractelement <4 x float> %3255, i64 3
+ %3260 = load <8 x half>, ptr addrspace(3) %2786, align 16
+ %3261 = load <8 x half>, ptr addrspace(3) %2788, align 16
+ %3262 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3263 = shufflevector <8 x half> %3260, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3264 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3265 = shufflevector <8 x half> %3261, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3266 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3262, <4 x float> %3087, i32 0, i32 0, i32 0)
+ %3267 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3263, <4 x float> %3266, i32 0, i32 0, i32 0)
+ %3268 = extractelement <4 x float> %3267, i64 0
+ %3269 = extractelement <4 x float> %3267, i64 1
+ %3270 = extractelement <4 x float> %3267, i64 2
+ %3271 = extractelement <4 x float> %3267, i64 3
+ %3272 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3262, <4 x float> %3089, i32 0, i32 0, i32 0)
+ %3273 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3263, <4 x float> %3272, i32 0, i32 0, i32 0)
+ %3274 = extractelement <4 x float> %3273, i64 0
+ %3275 = extractelement <4 x float> %3273, i64 1
+ %3276 = extractelement <4 x float> %3273, i64 2
+ %3277 = extractelement <4 x float> %3273, i64 3
+ %3278 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3264, <4 x float> %3091, i32 0, i32 0, i32 0)
+ %3279 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3265, <4 x float> %3278, i32 0, i32 0, i32 0)
+ %3280 = extractelement <4 x float> %3279, i64 0
+ %3281 = extractelement <4 x float> %3279, i64 1
+ %3282 = extractelement <4 x float> %3279, i64 2
+ %3283 = extractelement <4 x float> %3279, i64 3
+ %3284 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3264, <4 x float> %3093, i32 0, i32 0, i32 0)
+ %3285 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3265, <4 x float> %3284, i32 0, i32 0, i32 0)
+ %3286 = extractelement <4 x float> %3285, i64 0
+ %3287 = extractelement <4 x float> %3285, i64 1
+ %3288 = extractelement <4 x float> %3285, i64 2
+ %3289 = extractelement <4 x float> %3285, i64 3
+ %3290 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3262, <4 x float> %3095, i32 0, i32 0, i32 0)
+ %3291 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3263, <4 x float> %3290, i32 0, i32 0, i32 0)
+ %3292 = extractelement <4 x float> %3291, i64 0
+ %3293 = extractelement <4 x float> %3291, i64 1
+ %3294 = extractelement <4 x float> %3291, i64 2
+ %3295 = extractelement <4 x float> %3291, i64 3
+ %3296 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3262, <4 x float> %3097, i32 0, i32 0, i32 0)
+ %3297 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3263, <4 x float> %3296, i32 0, i32 0, i32 0)
+ %3298 = extractelement <4 x float> %3297, i64 0
+ %3299 = extractelement <4 x float> %3297, i64 1
+ %3300 = extractelement <4 x float> %3297, i64 2
+ %3301 = extractelement <4 x float> %3297, i64 3
+ %3302 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3264, <4 x float> %3099, i32 0, i32 0, i32 0)
+ %3303 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3265, <4 x float> %3302, i32 0, i32 0, i32 0)
+ %3304 = extractelement <4 x float> %3303, i64 0
+ %3305 = extractelement <4 x float> %3303, i64 1
+ %3306 = extractelement <4 x float> %3303, i64 2
+ %3307 = extractelement <4 x float> %3303, i64 3
+ %3308 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3264, <4 x float> %3101, i32 0, i32 0, i32 0)
+ %3309 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3265, <4 x float> %3308, i32 0, i32 0, i32 0)
+ %3310 = extractelement <4 x float> %3309, i64 0
+ %3311 = extractelement <4 x float> %3309, i64 1
+ %3312 = extractelement <4 x float> %3309, i64 2
+ %3313 = extractelement <4 x float> %3309, i64 3
+ %3314 = load <8 x half>, ptr addrspace(3) %2810, align 16
+ %3315 = load <8 x half>, ptr addrspace(3) %2812, align 16
+ %3316 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3317 = shufflevector <8 x half> %3314, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3318 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3319 = shufflevector <8 x half> %3315, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3320 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3198, <4 x float> %3059, i32 0, i32 0, i32 0)
+ %3321 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3199, <4 x float> %3320, i32 0, i32 0, i32 0)
+ %3322 = extractelement <4 x float> %3321, i64 0
+ %3323 = extractelement <4 x float> %3321, i64 1
+ %3324 = extractelement <4 x float> %3321, i64 2
+ %3325 = extractelement <4 x float> %3321, i64 3
+ %3326 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3198, <4 x float> %3061, i32 0, i32 0, i32 0)
+ %3327 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3199, <4 x float> %3326, i32 0, i32 0, i32 0)
+ %3328 = extractelement <4 x float> %3327, i64 0
+ %3329 = extractelement <4 x float> %3327, i64 1
+ %3330 = extractelement <4 x float> %3327, i64 2
+ %3331 = extractelement <4 x float> %3327, i64 3
+ %3332 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3200, <4 x float> %3063, i32 0, i32 0, i32 0)
+ %3333 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3201, <4 x float> %3332, i32 0, i32 0, i32 0)
+ %3334 = extractelement <4 x float> %3333, i64 0
+ %3335 = extractelement <4 x float> %3333, i64 1
+ %3336 = extractelement <4 x float> %3333, i64 2
+ %3337 = extractelement <4 x float> %3333, i64 3
+ %3338 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3200, <4 x float> %3065, i32 0, i32 0, i32 0)
+ %3339 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3201, <4 x float> %3338, i32 0, i32 0, i32 0)
+ %3340 = extractelement <4 x float> %3339, i64 0
+ %3341 = extractelement <4 x float> %3339, i64 1
+ %3342 = extractelement <4 x float> %3339, i64 2
+ %3343 = extractelement <4 x float> %3339, i64 3
+ %3344 = load <8 x half>, ptr addrspace(3) %2826, align 16
+ %3345 = load <8 x half>, ptr addrspace(3) %2828, align 16
+ %3346 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3347 = shufflevector <8 x half> %3344, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3348 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3349 = shufflevector <8 x half> %3345, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3350 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3198, <4 x float> %3073, i32 0, i32 0, i32 0)
+ %3351 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3199, <4 x float> %3350, i32 0, i32 0, i32 0)
+ %3352 = extractelement <4 x float> %3351, i64 0
+ %3353 = extractelement <4 x float> %3351, i64 1
+ %3354 = extractelement <4 x float> %3351, i64 2
+ %3355 = extractelement <4 x float> %3351, i64 3
+ %3356 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3198, <4 x float> %3075, i32 0, i32 0, i32 0)
+ %3357 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3199, <4 x float> %3356, i32 0, i32 0, i32 0)
+ %3358 = extractelement <4 x float> %3357, i64 0
+ %3359 = extractelement <4 x float> %3357, i64 1
+ %3360 = extractelement <4 x float> %3357, i64 2
+ %3361 = extractelement <4 x float> %3357, i64 3
+ %3362 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3200, <4 x float> %3077, i32 0, i32 0, i32 0)
+ %3363 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3201, <4 x float> %3362, i32 0, i32 0, i32 0)
+ %3364 = extractelement <4 x float> %3363, i64 0
+ %3365 = extractelement <4 x float> %3363, i64 1
+ %3366 = extractelement <4 x float> %3363, i64 2
+ %3367 = extractelement <4 x float> %3363, i64 3
+ %3368 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3200, <4 x float> %3079, i32 0, i32 0, i32 0)
+ %3369 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3201, <4 x float> %3368, i32 0, i32 0, i32 0)
+ %3370 = extractelement <4 x float> %3369, i64 0
+ %3371 = extractelement <4 x float> %3369, i64 1
+ %3372 = extractelement <4 x float> %3369, i64 2
+ %3373 = extractelement <4 x float> %3369, i64 3
+ %3374 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3262, <4 x float> %3103, i32 0, i32 0, i32 0)
+ %3375 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3263, <4 x float> %3374, i32 0, i32 0, i32 0)
+ %3376 = extractelement <4 x float> %3375, i64 0
+ %3377 = extractelement <4 x float> %3375, i64 1
+ %3378 = extractelement <4 x float> %3375, i64 2
+ %3379 = extractelement <4 x float> %3375, i64 3
+ %3380 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3262, <4 x float> %3105, i32 0, i32 0, i32 0)
+ %3381 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3263, <4 x float> %3380, i32 0, i32 0, i32 0)
+ %3382 = extractelement <4 x float> %3381, i64 0
+ %3383 = extractelement <4 x float> %3381, i64 1
+ %3384 = extractelement <4 x float> %3381, i64 2
+ %3385 = extractelement <4 x float> %3381, i64 3
+ %3386 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3264, <4 x float> %3107, i32 0, i32 0, i32 0)
+ %3387 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3265, <4 x float> %3386, i32 0, i32 0, i32 0)
+ %3388 = extractelement <4 x float> %3387, i64 0
+ %3389 = extractelement <4 x float> %3387, i64 1
+ %3390 = extractelement <4 x float> %3387, i64 2
+ %3391 = extractelement <4 x float> %3387, i64 3
+ %3392 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3264, <4 x float> %3109, i32 0, i32 0, i32 0)
+ %3393 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3265, <4 x float> %3392, i32 0, i32 0, i32 0)
+ %3394 = extractelement <4 x float> %3393, i64 0
+ %3395 = extractelement <4 x float> %3393, i64 1
+ %3396 = extractelement <4 x float> %3393, i64 2
+ %3397 = extractelement <4 x float> %3393, i64 3
+ %3398 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3262, <4 x float> %3111, i32 0, i32 0, i32 0)
+ %3399 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3263, <4 x float> %3398, i32 0, i32 0, i32 0)
+ %3400 = extractelement <4 x float> %3399, i64 0
+ %3401 = extractelement <4 x float> %3399, i64 1
+ %3402 = extractelement <4 x float> %3399, i64 2
+ %3403 = extractelement <4 x float> %3399, i64 3
+ %3404 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3262, <4 x float> %3113, i32 0, i32 0, i32 0)
+ %3405 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3263, <4 x float> %3404, i32 0, i32 0, i32 0)
+ %3406 = extractelement <4 x float> %3405, i64 0
+ %3407 = extractelement <4 x float> %3405, i64 1
+ %3408 = extractelement <4 x float> %3405, i64 2
+ %3409 = extractelement <4 x float> %3405, i64 3
+ %3410 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3264, <4 x float> %3115, i32 0, i32 0, i32 0)
+ %3411 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3265, <4 x float> %3410, i32 0, i32 0, i32 0)
+ %3412 = extractelement <4 x float> %3411, i64 0
+ %3413 = extractelement <4 x float> %3411, i64 1
+ %3414 = extractelement <4 x float> %3411, i64 2
+ %3415 = extractelement <4 x float> %3411, i64 3
+ %3416 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3264, <4 x float> %3117, i32 0, i32 0, i32 0)
+ %3417 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3265, <4 x float> %3416, i32 0, i32 0, i32 0)
+ %3418 = extractelement <4 x float> %3417, i64 0
+ %3419 = extractelement <4 x float> %3417, i64 1
+ %3420 = extractelement <4 x float> %3417, i64 2
+ %3421 = extractelement <4 x float> %3417, i64 3
+ %3422 = load <8 x half>, ptr addrspace(3) %2858, align 16
+ %3423 = load <8 x half>, ptr addrspace(3) %2860, align 16
+ %3424 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3425 = shufflevector <8 x half> %3422, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3426 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3427 = shufflevector <8 x half> %3423, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3428 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3424, <4 x float> %3125, i32 0, i32 0, i32 0)
+ %3429 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3425, <4 x float> %3428, i32 0, i32 0, i32 0)
+ %3430 = extractelement <4 x float> %3429, i64 0
+ %3431 = extractelement <4 x float> %3429, i64 1
+ %3432 = extractelement <4 x float> %3429, i64 2
+ %3433 = extractelement <4 x float> %3429, i64 3
+ %3434 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3424, <4 x float> %3127, i32 0, i32 0, i32 0)
+ %3435 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3425, <4 x float> %3434, i32 0, i32 0, i32 0)
+ %3436 = extractelement <4 x float> %3435, i64 0
+ %3437 = extractelement <4 x float> %3435, i64 1
+ %3438 = extractelement <4 x float> %3435, i64 2
+ %3439 = extractelement <4 x float> %3435, i64 3
+ %3440 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3426, <4 x float> %3129, i32 0, i32 0, i32 0)
+ %3441 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3427, <4 x float> %3440, i32 0, i32 0, i32 0)
+ %3442 = extractelement <4 x float> %3441, i64 0
+ %3443 = extractelement <4 x float> %3441, i64 1
+ %3444 = extractelement <4 x float> %3441, i64 2
+ %3445 = extractelement <4 x float> %3441, i64 3
+ %3446 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3426, <4 x float> %3131, i32 0, i32 0, i32 0)
+ %3447 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3427, <4 x float> %3446, i32 0, i32 0, i32 0)
+ %3448 = extractelement <4 x float> %3447, i64 0
+ %3449 = extractelement <4 x float> %3447, i64 1
+ %3450 = extractelement <4 x float> %3447, i64 2
+ %3451 = extractelement <4 x float> %3447, i64 3
+ %3452 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3424, <4 x float> %3133, i32 0, i32 0, i32 0)
+ %3453 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3425, <4 x float> %3452, i32 0, i32 0, i32 0)
+ %3454 = extractelement <4 x float> %3453, i64 0
+ %3455 = extractelement <4 x float> %3453, i64 1
+ %3456 = extractelement <4 x float> %3453, i64 2
+ %3457 = extractelement <4 x float> %3453, i64 3
+ %3458 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3424, <4 x float> %3135, i32 0, i32 0, i32 0)
+ %3459 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3425, <4 x float> %3458, i32 0, i32 0, i32 0)
+ %3460 = extractelement <4 x float> %3459, i64 0
+ %3461 = extractelement <4 x float> %3459, i64 1
+ %3462 = extractelement <4 x float> %3459, i64 2
+ %3463 = extractelement <4 x float> %3459, i64 3
+ %3464 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3426, <4 x float> %3137, i32 0, i32 0, i32 0)
+ %3465 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3427, <4 x float> %3464, i32 0, i32 0, i32 0)
+ %3466 = extractelement <4 x float> %3465, i64 0
+ %3467 = extractelement <4 x float> %3465, i64 1
+ %3468 = extractelement <4 x float> %3465, i64 2
+ %3469 = extractelement <4 x float> %3465, i64 3
+ %3470 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3426, <4 x float> %3139, i32 0, i32 0, i32 0)
+ %3471 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3427, <4 x float> %3470, i32 0, i32 0, i32 0)
+ %3472 = extractelement <4 x float> %3471, i64 0
+ %3473 = extractelement <4 x float> %3471, i64 1
+ %3474 = extractelement <4 x float> %3471, i64 2
+ %3475 = extractelement <4 x float> %3471, i64 3
+ %3476 = load <8 x half>, ptr addrspace(3) %2882, align 16
+ %3477 = load <8 x half>, ptr addrspace(3) %2884, align 16
+ %3478 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3479 = shufflevector <8 x half> %3476, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3480 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3481 = shufflevector <8 x half> %3477, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %3482 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3478, <4 x float> %3163, i32 0, i32 0, i32 0)
+ %3483 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3479, <4 x float> %3482, i32 0, i32 0, i32 0)
+ %3484 = extractelement <4 x float> %3483, i64 0
+ %3485 = extractelement <4 x float> %3483, i64 1
+ %3486 = extractelement <4 x float> %3483, i64 2
+ %3487 = extractelement <4 x float> %3483, i64 3
+ %3488 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3478, <4 x float> %3165, i32 0, i32 0, i32 0)
+ %3489 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3479, <4 x float> %3488, i32 0, i32 0, i32 0)
+ %3490 = extractelement <4 x float> %3489, i64 0
+ %3491 = extractelement <4 x float> %3489, i64 1
+ %3492 = extractelement <4 x float> %3489, i64 2
+ %3493 = extractelement <4 x float> %3489, i64 3
+ %3494 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3202, <4 x half> %3480, <4 x float> %3167, i32 0, i32 0, i32 0)
+ %3495 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3203, <4 x half> %3481, <4 x float> %3494, i32 0, i32 0, i32 0)
+ %3496 = extractelement <4 x float> %3495, i64 0
+ %3497 = extractelement <4 x float> %3495, i64 1
+ %3498 = extractelement <4 x float> %3495, i64 2
+ %3499 = extractelement <4 x float> %3495, i64 3
+ %3500 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3204, <4 x half> %3480, <4 x float> %3169, i32 0, i32 0, i32 0)
+ %3501 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3205, <4 x half> %3481, <4 x float> %3500, i32 0, i32 0, i32 0)
+ %3502 = extractelement <4 x float> %3501, i64 0
+ %3503 = extractelement <4 x float> %3501, i64 1
+ %3504 = extractelement <4 x float> %3501, i64 2
+ %3505 = extractelement <4 x float> %3501, i64 3
+ %3506 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3478, <4 x float> %3171, i32 0, i32 0, i32 0)
+ %3507 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3479, <4 x float> %3506, i32 0, i32 0, i32 0)
+ %3508 = extractelement <4 x float> %3507, i64 0
+ %3509 = extractelement <4 x float> %3507, i64 1
+ %3510 = extractelement <4 x float> %3507, i64 2
+ %3511 = extractelement <4 x float> %3507, i64 3
+ %3512 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3478, <4 x float> %3173, i32 0, i32 0, i32 0)
+ %3513 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3479, <4 x float> %3512, i32 0, i32 0, i32 0)
+ %3514 = extractelement <4 x float> %3513, i64 0
+ %3515 = extractelement <4 x float> %3513, i64 1
+ %3516 = extractelement <4 x float> %3513, i64 2
+ %3517 = extractelement <4 x float> %3513, i64 3
+ %3518 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3232, <4 x half> %3480, <4 x float> %3175, i32 0, i32 0, i32 0)
+ %3519 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3233, <4 x half> %3481, <4 x float> %3518, i32 0, i32 0, i32 0)
+ %3520 = extractelement <4 x float> %3519, i64 0
+ %3521 = extractelement <4 x float> %3519, i64 1
+ %3522 = extractelement <4 x float> %3519, i64 2
+ %3523 = extractelement <4 x float> %3519, i64 3
+ %3524 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3234, <4 x half> %3480, <4 x float> %3177, i32 0, i32 0, i32 0)
+ %3525 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3235, <4 x half> %3481, <4 x float> %3524, i32 0, i32 0, i32 0)
+ %3526 = extractelement <4 x float> %3525, i64 0
+ %3527 = extractelement <4 x float> %3525, i64 1
+ %3528 = extractelement <4 x float> %3525, i64 2
+ %3529 = extractelement <4 x float> %3525, i64 3
+ %3530 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3424, <4 x float> %3141, i32 0, i32 0, i32 0)
+ %3531 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3425, <4 x float> %3530, i32 0, i32 0, i32 0)
+ %3532 = extractelement <4 x float> %3531, i64 0
+ %3533 = extractelement <4 x float> %3531, i64 1
+ %3534 = extractelement <4 x float> %3531, i64 2
+ %3535 = extractelement <4 x float> %3531, i64 3
+ %3536 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3424, <4 x float> %3143, i32 0, i32 0, i32 0)
+ %3537 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3425, <4 x float> %3536, i32 0, i32 0, i32 0)
+ %3538 = extractelement <4 x float> %3537, i64 0
+ %3539 = extractelement <4 x float> %3537, i64 1
+ %3540 = extractelement <4 x float> %3537, i64 2
+ %3541 = extractelement <4 x float> %3537, i64 3
+ %3542 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3426, <4 x float> %3145, i32 0, i32 0, i32 0)
+ %3543 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3427, <4 x float> %3542, i32 0, i32 0, i32 0)
+ %3544 = extractelement <4 x float> %3543, i64 0
+ %3545 = extractelement <4 x float> %3543, i64 1
+ %3546 = extractelement <4 x float> %3543, i64 2
+ %3547 = extractelement <4 x float> %3543, i64 3
+ %3548 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3426, <4 x float> %3147, i32 0, i32 0, i32 0)
+ %3549 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3427, <4 x float> %3548, i32 0, i32 0, i32 0)
+ %3550 = extractelement <4 x float> %3549, i64 0
+ %3551 = extractelement <4 x float> %3549, i64 1
+ %3552 = extractelement <4 x float> %3549, i64 2
+ %3553 = extractelement <4 x float> %3549, i64 3
+ %3554 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3424, <4 x float> %3149, i32 0, i32 0, i32 0)
+ %3555 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3425, <4 x float> %3554, i32 0, i32 0, i32 0)
+ %3556 = extractelement <4 x float> %3555, i64 0
+ %3557 = extractelement <4 x float> %3555, i64 1
+ %3558 = extractelement <4 x float> %3555, i64 2
+ %3559 = extractelement <4 x float> %3555, i64 3
+ %3560 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3424, <4 x float> %3151, i32 0, i32 0, i32 0)
+ %3561 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3425, <4 x float> %3560, i32 0, i32 0, i32 0)
+ %3562 = extractelement <4 x float> %3561, i64 0
+ %3563 = extractelement <4 x float> %3561, i64 1
+ %3564 = extractelement <4 x float> %3561, i64 2
+ %3565 = extractelement <4 x float> %3561, i64 3
+ %3566 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3426, <4 x float> %3153, i32 0, i32 0, i32 0)
+ %3567 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3427, <4 x float> %3566, i32 0, i32 0, i32 0)
+ %3568 = extractelement <4 x float> %3567, i64 0
+ %3569 = extractelement <4 x float> %3567, i64 1
+ %3570 = extractelement <4 x float> %3567, i64 2
+ %3571 = extractelement <4 x float> %3567, i64 3
+ %3572 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3426, <4 x float> %3155, i32 0, i32 0, i32 0)
+ %3573 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3427, <4 x float> %3572, i32 0, i32 0, i32 0)
+ %3574 = extractelement <4 x float> %3573, i64 0
+ %3575 = extractelement <4 x float> %3573, i64 1
+ %3576 = extractelement <4 x float> %3573, i64 2
+ %3577 = extractelement <4 x float> %3573, i64 3
+ %3578 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3478, <4 x float> %3179, i32 0, i32 0, i32 0)
+ %3579 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3479, <4 x float> %3578, i32 0, i32 0, i32 0)
+ %3580 = extractelement <4 x float> %3579, i64 0
+ %3581 = extractelement <4 x float> %3579, i64 1
+ %3582 = extractelement <4 x float> %3579, i64 2
+ %3583 = extractelement <4 x float> %3579, i64 3
+ %3584 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3478, <4 x float> %3181, i32 0, i32 0, i32 0)
+ %3585 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3479, <4 x float> %3584, i32 0, i32 0, i32 0)
+ %3586 = extractelement <4 x float> %3585, i64 0
+ %3587 = extractelement <4 x float> %3585, i64 1
+ %3588 = extractelement <4 x float> %3585, i64 2
+ %3589 = extractelement <4 x float> %3585, i64 3
+ %3590 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3316, <4 x half> %3480, <4 x float> %3183, i32 0, i32 0, i32 0)
+ %3591 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3317, <4 x half> %3481, <4 x float> %3590, i32 0, i32 0, i32 0)
+ %3592 = extractelement <4 x float> %3591, i64 0
+ %3593 = extractelement <4 x float> %3591, i64 1
+ %3594 = extractelement <4 x float> %3591, i64 2
+ %3595 = extractelement <4 x float> %3591, i64 3
+ %3596 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3318, <4 x half> %3480, <4 x float> %3185, i32 0, i32 0, i32 0)
+ %3597 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3319, <4 x half> %3481, <4 x float> %3596, i32 0, i32 0, i32 0)
+ %3598 = extractelement <4 x float> %3597, i64 0
+ %3599 = extractelement <4 x float> %3597, i64 1
+ %3600 = extractelement <4 x float> %3597, i64 2
+ %3601 = extractelement <4 x float> %3597, i64 3
+ %3602 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3478, <4 x float> %3187, i32 0, i32 0, i32 0)
+ %3603 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3479, <4 x float> %3602, i32 0, i32 0, i32 0)
+ %3604 = extractelement <4 x float> %3603, i64 0
+ %3605 = extractelement <4 x float> %3603, i64 1
+ %3606 = extractelement <4 x float> %3603, i64 2
+ %3607 = extractelement <4 x float> %3603, i64 3
+ %3608 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3478, <4 x float> %3189, i32 0, i32 0, i32 0)
+ %3609 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3479, <4 x float> %3608, i32 0, i32 0, i32 0)
+ %3610 = extractelement <4 x float> %3609, i64 0
+ %3611 = extractelement <4 x float> %3609, i64 1
+ %3612 = extractelement <4 x float> %3609, i64 2
+ %3613 = extractelement <4 x float> %3609, i64 3
+ %3614 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3346, <4 x half> %3480, <4 x float> %3191, i32 0, i32 0, i32 0)
+ %3615 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3347, <4 x half> %3481, <4 x float> %3614, i32 0, i32 0, i32 0)
+ %3616 = extractelement <4 x float> %3615, i64 0
+ %3617 = extractelement <4 x float> %3615, i64 1
+ %3618 = extractelement <4 x float> %3615, i64 2
+ %3619 = extractelement <4 x float> %3615, i64 3
+ %3620 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3348, <4 x half> %3480, <4 x float> %3193, i32 0, i32 0, i32 0)
+ %3621 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %3349, <4 x half> %3481, <4 x float> %3620, i32 0, i32 0, i32 0)
+ %3622 = extractelement <4 x float> %3621, i64 0
+ %3623 = extractelement <4 x float> %3621, i64 1
+ %3624 = extractelement <4 x float> %3621, i64 2
+ %3625 = extractelement <4 x float> %3621, i64 3
+ %3626 = mul i32 %31, %9
+ %3627 = sext i32 %3626 to i64
+ %3628 = getelementptr half, ptr addrspace(1) %2, i64 %3627
+ %3629 = sext i32 %118 to i64
+ %3630 = getelementptr half, ptr addrspace(1) %3628, i64 %3629
+ %3631 = mul i32 %9, %225
+ %3632 = mul i32 %9, %2309
+ %3633 = mul i32 %9, %2308
+ %3634 = mul i32 %9, %2307
+ %3635 = mul i32 %9, %2306
+ %3636 = mul i32 %9, %2305
+ %3637 = mul i32 %9, %2304
+ %3638 = mul i32 %9, %2303
+ %3639 = add i32 %3631, %2295
+ %3640 = add i32 %3631, %2302
+ %3641 = add i32 %3632, %2295
+ %3642 = add i32 %3632, %2302
+ %3643 = fptrunc float %3208 to half
+ %3644 = fptrunc float %3209 to half
+ %3645 = fptrunc float %3210 to half
+ %3646 = fptrunc float %3211 to half
+ %3647 = fptrunc float %3214 to half
+ %3648 = fptrunc float %3215 to half
+ %3649 = fptrunc float %3216 to half
+ %3650 = fptrunc float %3217 to half
+ %3651 = fptrunc float %3220 to half
+ %3652 = fptrunc float %3221 to half
+ %3653 = fptrunc float %3222 to half
+ %3654 = fptrunc float %3223 to half
+ %3655 = fptrunc float %3226 to half
+ %3656 = fptrunc float %3227 to half
+ %3657 = fptrunc float %3228 to half
+ %3658 = fptrunc float %3229 to half
+ %3659 = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %3630, i16 0, i32 2147483646, i32 159744)
+ %3660 = insertelement <4 x half> poison, half %3643, i64 0
+ %3661 = insertelement <4 x half> %3660, half %3644, i64 1
+ %3662 = insertelement <4 x half> %3661, half %3645, i64 2
+ %3663 = insertelement <4 x half> %3662, half %3646, i64 3
+ %3664 = bitcast <4 x half> %3663 to <2 x i32>
+ %3665 = shl i32 %3639, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3664, ptr addrspace(8) %3659, i32 %3665, i32 0, i32 0)
+ %3666 = insertelement <4 x half> poison, half %3647, i64 0
+ %3667 = insertelement <4 x half> %3666, half %3648, i64 1
+ %3668 = insertelement <4 x half> %3667, half %3649, i64 2
+ %3669 = insertelement <4 x half> %3668, half %3650, i64 3
+ %3670 = bitcast <4 x half> %3669 to <2 x i32>
+ %3671 = shl i32 %3640, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3670, ptr addrspace(8) %3659, i32 %3671, i32 0, i32 0)
+ %3672 = insertelement <4 x half> poison, half %3651, i64 0
+ %3673 = insertelement <4 x half> %3672, half %3652, i64 1
+ %3674 = insertelement <4 x half> %3673, half %3653, i64 2
+ %3675 = insertelement <4 x half> %3674, half %3654, i64 3
+ %3676 = bitcast <4 x half> %3675 to <2 x i32>
+ %3677 = shl i32 %3641, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3676, ptr addrspace(8) %3659, i32 %3677, i32 0, i32 0)
+ %3678 = insertelement <4 x half> poison, half %3655, i64 0
+ %3679 = insertelement <4 x half> %3678, half %3656, i64 1
+ %3680 = insertelement <4 x half> %3679, half %3657, i64 2
+ %3681 = insertelement <4 x half> %3680, half %3658, i64 3
+ %3682 = bitcast <4 x half> %3681 to <2 x i32>
+ %3683 = shl i32 %3642, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3682, ptr addrspace(8) %3659, i32 %3683, i32 0, i32 0)
+ %3684 = add i32 %3631, %2301
+ %3685 = add i32 %3631, %2300
+ %3686 = add i32 %3632, %2301
+ %3687 = add i32 %3632, %2300
+ %3688 = fptrunc float %3238 to half
+ %3689 = fptrunc float %3239 to half
+ %3690 = fptrunc float %3240 to half
+ %3691 = fptrunc float %3241 to half
+ %3692 = fptrunc float %3244 to half
+ %3693 = fptrunc float %3245 to half
+ %3694 = fptrunc float %3246 to half
+ %3695 = fptrunc float %3247 to half
+ %3696 = fptrunc float %3250 to half
+ %3697 = fptrunc float %3251 to half
+ %3698 = fptrunc float %3252 to half
+ %3699 = fptrunc float %3253 to half
+ %3700 = fptrunc float %3256 to half
+ %3701 = fptrunc float %3257 to half
+ %3702 = fptrunc float %3258 to half
+ %3703 = fptrunc float %3259 to half
+ %3704 = insertelement <4 x half> poison, half %3688, i64 0
+ %3705 = insertelement <4 x half> %3704, half %3689, i64 1
+ %3706 = insertelement <4 x half> %3705, half %3690, i64 2
+ %3707 = insertelement <4 x half> %3706, half %3691, i64 3
+ %3708 = bitcast <4 x half> %3707 to <2 x i32>
+ %3709 = shl i32 %3684, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3708, ptr addrspace(8) %3659, i32 %3709, i32 0, i32 0)
+ %3710 = insertelement <4 x half> poison, half %3692, i64 0
+ %3711 = insertelement <4 x half> %3710, half %3693, i64 1
+ %3712 = insertelement <4 x half> %3711, half %3694, i64 2
+ %3713 = insertelement <4 x half> %3712, half %3695, i64 3
+ %3714 = bitcast <4 x half> %3713 to <2 x i32>
+ %3715 = shl i32 %3685, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3714, ptr addrspace(8) %3659, i32 %3715, i32 0, i32 0)
+ %3716 = insertelement <4 x half> poison, half %3696, i64 0
+ %3717 = insertelement <4 x half> %3716, half %3697, i64 1
+ %3718 = insertelement <4 x half> %3717, half %3698, i64 2
+ %3719 = insertelement <4 x half> %3718, half %3699, i64 3
+ %3720 = bitcast <4 x half> %3719 to <2 x i32>
+ %3721 = shl i32 %3686, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3720, ptr addrspace(8) %3659, i32 %3721, i32 0, i32 0)
+ %3722 = insertelement <4 x half> poison, half %3700, i64 0
+ %3723 = insertelement <4 x half> %3722, half %3701, i64 1
+ %3724 = insertelement <4 x half> %3723, half %3702, i64 2
+ %3725 = insertelement <4 x half> %3724, half %3703, i64 3
+ %3726 = bitcast <4 x half> %3725 to <2 x i32>
+ %3727 = shl i32 %3687, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3726, ptr addrspace(8) %3659, i32 %3727, i32 0, i32 0)
+ %3728 = add i32 %3631, %2299
+ %3729 = add i32 %3631, %2298
+ %3730 = add i32 %3632, %2299
+ %3731 = add i32 %3632, %2298
+ %3732 = fptrunc float %3322 to half
+ %3733 = fptrunc float %3323 to half
+ %3734 = fptrunc float %3324 to half
+ %3735 = fptrunc float %3325 to half
+ %3736 = fptrunc float %3328 to half
+ %3737 = fptrunc float %3329 to half
+ %3738 = fptrunc float %3330 to half
+ %3739 = fptrunc float %3331 to half
+ %3740 = fptrunc float %3334 to half
+ %3741 = fptrunc float %3335 to half
+ %3742 = fptrunc float %3336 to half
+ %3743 = fptrunc float %3337 to half
+ %3744 = fptrunc float %3340 to half
+ %3745 = fptrunc float %3341 to half
+ %3746 = fptrunc float %3342 to half
+ %3747 = fptrunc float %3343 to half
+ %3748 = insertelement <4 x half> poison, half %3732, i64 0
+ %3749 = insertelement <4 x half> %3748, half %3733, i64 1
+ %3750 = insertelement <4 x half> %3749, half %3734, i64 2
+ %3751 = insertelement <4 x half> %3750, half %3735, i64 3
+ %3752 = bitcast <4 x half> %3751 to <2 x i32>
+ %3753 = shl i32 %3728, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3752, ptr addrspace(8) %3659, i32 %3753, i32 0, i32 0)
+ %3754 = insertelement <4 x half> poison, half %3736, i64 0
+ %3755 = insertelement <4 x half> %3754, half %3737, i64 1
+ %3756 = insertelement <4 x half> %3755, half %3738, i64 2
+ %3757 = insertelement <4 x half> %3756, half %3739, i64 3
+ %3758 = bitcast <4 x half> %3757 to <2 x i32>
+ %3759 = shl i32 %3729, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3758, ptr addrspace(8) %3659, i32 %3759, i32 0, i32 0)
+ %3760 = insertelement <4 x half> poison, half %3740, i64 0
+ %3761 = insertelement <4 x half> %3760, half %3741, i64 1
+ %3762 = insertelement <4 x half> %3761, half %3742, i64 2
+ %3763 = insertelement <4 x half> %3762, half %3743, i64 3
+ %3764 = bitcast <4 x half> %3763 to <2 x i32>
+ %3765 = shl i32 %3730, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3764, ptr addrspace(8) %3659, i32 %3765, i32 0, i32 0)
+ %3766 = insertelement <4 x half> poison, half %3744, i64 0
+ %3767 = insertelement <4 x half> %3766, half %3745, i64 1
+ %3768 = insertelement <4 x half> %3767, half %3746, i64 2
+ %3769 = insertelement <4 x half> %3768, half %3747, i64 3
+ %3770 = bitcast <4 x half> %3769 to <2 x i32>
+ %3771 = shl i32 %3731, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3770, ptr addrspace(8) %3659, i32 %3771, i32 0, i32 0)
+ %3772 = add i32 %3631, %2297
+ %3773 = add i32 %3631, %2296
+ %3774 = add i32 %3632, %2297
+ %3775 = add i32 %3632, %2296
+ %3776 = fptrunc float %3352 to half
+ %3777 = fptrunc float %3353 to half
+ %3778 = fptrunc float %3354 to half
+ %3779 = fptrunc float %3355 to half
+ %3780 = fptrunc float %3358 to half
+ %3781 = fptrunc float %3359 to half
+ %3782 = fptrunc float %3360 to half
+ %3783 = fptrunc float %3361 to half
+ %3784 = fptrunc float %3364 to half
+ %3785 = fptrunc float %3365 to half
+ %3786 = fptrunc float %3366 to half
+ %3787 = fptrunc float %3367 to half
+ %3788 = fptrunc float %3370 to half
+ %3789 = fptrunc float %3371 to half
+ %3790 = fptrunc float %3372 to half
+ %3791 = fptrunc float %3373 to half
+ %3792 = insertelement <4 x half> poison, half %3776, i64 0
+ %3793 = insertelement <4 x half> %3792, half %3777, i64 1
+ %3794 = insertelement <4 x half> %3793, half %3778, i64 2
+ %3795 = insertelement <4 x half> %3794, half %3779, i64 3
+ %3796 = bitcast <4 x half> %3795 to <2 x i32>
+ %3797 = shl i32 %3772, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3796, ptr addrspace(8) %3659, i32 %3797, i32 0, i32 0)
+ %3798 = insertelement <4 x half> poison, half %3780, i64 0
+ %3799 = insertelement <4 x half> %3798, half %3781, i64 1
+ %3800 = insertelement <4 x half> %3799, half %3782, i64 2
+ %3801 = insertelement <4 x half> %3800, half %3783, i64 3
+ %3802 = bitcast <4 x half> %3801 to <2 x i32>
+ %3803 = shl i32 %3773, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3802, ptr addrspace(8) %3659, i32 %3803, i32 0, i32 0)
+ %3804 = insertelement <4 x half> poison, half %3784, i64 0
+ %3805 = insertelement <4 x half> %3804, half %3785, i64 1
+ %3806 = insertelement <4 x half> %3805, half %3786, i64 2
+ %3807 = insertelement <4 x half> %3806, half %3787, i64 3
+ %3808 = bitcast <4 x half> %3807 to <2 x i32>
+ %3809 = shl i32 %3774, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3808, ptr addrspace(8) %3659, i32 %3809, i32 0, i32 0)
+ %3810 = insertelement <4 x half> poison, half %3788, i64 0
+ %3811 = insertelement <4 x half> %3810, half %3789, i64 1
+ %3812 = insertelement <4 x half> %3811, half %3790, i64 2
+ %3813 = insertelement <4 x half> %3812, half %3791, i64 3
+ %3814 = bitcast <4 x half> %3813 to <2 x i32>
+ %3815 = shl i32 %3775, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3814, ptr addrspace(8) %3659, i32 %3815, i32 0, i32 0)
+ %3816 = add i32 %3633, %2295
+ %3817 = add i32 %3633, %2302
+ %3818 = add i32 %3634, %2295
+ %3819 = add i32 %3634, %2302
+ %3820 = fptrunc float %3268 to half
+ %3821 = fptrunc float %3269 to half
+ %3822 = fptrunc float %3270 to half
+ %3823 = fptrunc float %3271 to half
+ %3824 = fptrunc float %3274 to half
+ %3825 = fptrunc float %3275 to half
+ %3826 = fptrunc float %3276 to half
+ %3827 = fptrunc float %3277 to half
+ %3828 = fptrunc float %3280 to half
+ %3829 = fptrunc float %3281 to half
+ %3830 = fptrunc float %3282 to half
+ %3831 = fptrunc float %3283 to half
+ %3832 = fptrunc float %3286 to half
+ %3833 = fptrunc float %3287 to half
+ %3834 = fptrunc float %3288 to half
+ %3835 = fptrunc float %3289 to half
+ %3836 = insertelement <4 x half> poison, half %3820, i64 0
+ %3837 = insertelement <4 x half> %3836, half %3821, i64 1
+ %3838 = insertelement <4 x half> %3837, half %3822, i64 2
+ %3839 = insertelement <4 x half> %3838, half %3823, i64 3
+ %3840 = bitcast <4 x half> %3839 to <2 x i32>
+ %3841 = shl i32 %3816, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3840, ptr addrspace(8) %3659, i32 %3841, i32 0, i32 0)
+ %3842 = insertelement <4 x half> poison, half %3824, i64 0
+ %3843 = insertelement <4 x half> %3842, half %3825, i64 1
+ %3844 = insertelement <4 x half> %3843, half %3826, i64 2
+ %3845 = insertelement <4 x half> %3844, half %3827, i64 3
+ %3846 = bitcast <4 x half> %3845 to <2 x i32>
+ %3847 = shl i32 %3817, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3846, ptr addrspace(8) %3659, i32 %3847, i32 0, i32 0)
+ %3848 = insertelement <4 x half> poison, half %3828, i64 0
+ %3849 = insertelement <4 x half> %3848, half %3829, i64 1
+ %3850 = insertelement <4 x half> %3849, half %3830, i64 2
+ %3851 = insertelement <4 x half> %3850, half %3831, i64 3
+ %3852 = bitcast <4 x half> %3851 to <2 x i32>
+ %3853 = shl i32 %3818, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3852, ptr addrspace(8) %3659, i32 %3853, i32 0, i32 0)
+ %3854 = insertelement <4 x half> poison, half %3832, i64 0
+ %3855 = insertelement <4 x half> %3854, half %3833, i64 1
+ %3856 = insertelement <4 x half> %3855, half %3834, i64 2
+ %3857 = insertelement <4 x half> %3856, half %3835, i64 3
+ %3858 = bitcast <4 x half> %3857 to <2 x i32>
+ %3859 = shl i32 %3819, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3858, ptr addrspace(8) %3659, i32 %3859, i32 0, i32 0)
+ %3860 = add i32 %3633, %2301
+ %3861 = add i32 %3633, %2300
+ %3862 = add i32 %3634, %2301
+ %3863 = add i32 %3634, %2300
+ %3864 = fptrunc float %3292 to half
+ %3865 = fptrunc float %3293 to half
+ %3866 = fptrunc float %3294 to half
+ %3867 = fptrunc float %3295 to half
+ %3868 = fptrunc float %3298 to half
+ %3869 = fptrunc float %3299 to half
+ %3870 = fptrunc float %3300 to half
+ %3871 = fptrunc float %3301 to half
+ %3872 = fptrunc float %3304 to half
+ %3873 = fptrunc float %3305 to half
+ %3874 = fptrunc float %3306 to half
+ %3875 = fptrunc float %3307 to half
+ %3876 = fptrunc float %3310 to half
+ %3877 = fptrunc float %3311 to half
+ %3878 = fptrunc float %3312 to half
+ %3879 = fptrunc float %3313 to half
+ %3880 = insertelement <4 x half> poison, half %3864, i64 0
+ %3881 = insertelement <4 x half> %3880, half %3865, i64 1
+ %3882 = insertelement <4 x half> %3881, half %3866, i64 2
+ %3883 = insertelement <4 x half> %3882, half %3867, i64 3
+ %3884 = bitcast <4 x half> %3883 to <2 x i32>
+ %3885 = shl i32 %3860, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3884, ptr addrspace(8) %3659, i32 %3885, i32 0, i32 0)
+ %3886 = insertelement <4 x half> poison, half %3868, i64 0
+ %3887 = insertelement <4 x half> %3886, half %3869, i64 1
+ %3888 = insertelement <4 x half> %3887, half %3870, i64 2
+ %3889 = insertelement <4 x half> %3888, half %3871, i64 3
+ %3890 = bitcast <4 x half> %3889 to <2 x i32>
+ %3891 = shl i32 %3861, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3890, ptr addrspace(8) %3659, i32 %3891, i32 0, i32 0)
+ %3892 = insertelement <4 x half> poison, half %3872, i64 0
+ %3893 = insertelement <4 x half> %3892, half %3873, i64 1
+ %3894 = insertelement <4 x half> %3893, half %3874, i64 2
+ %3895 = insertelement <4 x half> %3894, half %3875, i64 3
+ %3896 = bitcast <4 x half> %3895 to <2 x i32>
+ %3897 = shl i32 %3862, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3896, ptr addrspace(8) %3659, i32 %3897, i32 0, i32 0)
+ %3898 = insertelement <4 x half> poison, half %3876, i64 0
+ %3899 = insertelement <4 x half> %3898, half %3877, i64 1
+ %3900 = insertelement <4 x half> %3899, half %3878, i64 2
+ %3901 = insertelement <4 x half> %3900, half %3879, i64 3
+ %3902 = bitcast <4 x half> %3901 to <2 x i32>
+ %3903 = shl i32 %3863, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3902, ptr addrspace(8) %3659, i32 %3903, i32 0, i32 0)
+ %3904 = add i32 %3633, %2299
+ %3905 = add i32 %3633, %2298
+ %3906 = add i32 %3634, %2299
+ %3907 = add i32 %3634, %2298
+ %3908 = fptrunc float %3376 to half
+ %3909 = fptrunc float %3377 to half
+ %3910 = fptrunc float %3378 to half
+ %3911 = fptrunc float %3379 to half
+ %3912 = fptrunc float %3382 to half
+ %3913 = fptrunc float %3383 to half
+ %3914 = fptrunc float %3384 to half
+ %3915 = fptrunc float %3385 to half
+ %3916 = fptrunc float %3388 to half
+ %3917 = fptrunc float %3389 to half
+ %3918 = fptrunc float %3390 to half
+ %3919 = fptrunc float %3391 to half
+ %3920 = fptrunc float %3394 to half
+ %3921 = fptrunc float %3395 to half
+ %3922 = fptrunc float %3396 to half
+ %3923 = fptrunc float %3397 to half
+ %3924 = insertelement <4 x half> poison, half %3908, i64 0
+ %3925 = insertelement <4 x half> %3924, half %3909, i64 1
+ %3926 = insertelement <4 x half> %3925, half %3910, i64 2
+ %3927 = insertelement <4 x half> %3926, half %3911, i64 3
+ %3928 = bitcast <4 x half> %3927 to <2 x i32>
+ %3929 = shl i32 %3904, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3928, ptr addrspace(8) %3659, i32 %3929, i32 0, i32 0)
+ %3930 = insertelement <4 x half> poison, half %3912, i64 0
+ %3931 = insertelement <4 x half> %3930, half %3913, i64 1
+ %3932 = insertelement <4 x half> %3931, half %3914, i64 2
+ %3933 = insertelement <4 x half> %3932, half %3915, i64 3
+ %3934 = bitcast <4 x half> %3933 to <2 x i32>
+ %3935 = shl i32 %3905, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3934, ptr addrspace(8) %3659, i32 %3935, i32 0, i32 0)
+ %3936 = insertelement <4 x half> poison, half %3916, i64 0
+ %3937 = insertelement <4 x half> %3936, half %3917, i64 1
+ %3938 = insertelement <4 x half> %3937, half %3918, i64 2
+ %3939 = insertelement <4 x half> %3938, half %3919, i64 3
+ %3940 = bitcast <4 x half> %3939 to <2 x i32>
+ %3941 = shl i32 %3906, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3940, ptr addrspace(8) %3659, i32 %3941, i32 0, i32 0)
+ %3942 = insertelement <4 x half> poison, half %3920, i64 0
+ %3943 = insertelement <4 x half> %3942, half %3921, i64 1
+ %3944 = insertelement <4 x half> %3943, half %3922, i64 2
+ %3945 = insertelement <4 x half> %3944, half %3923, i64 3
+ %3946 = bitcast <4 x half> %3945 to <2 x i32>
+ %3947 = shl i32 %3907, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3946, ptr addrspace(8) %3659, i32 %3947, i32 0, i32 0)
+ %3948 = add i32 %3633, %2297
+ %3949 = add i32 %3633, %2296
+ %3950 = add i32 %3634, %2297
+ %3951 = add i32 %3634, %2296
+ %3952 = fptrunc float %3400 to half
+ %3953 = fptrunc float %3401 to half
+ %3954 = fptrunc float %3402 to half
+ %3955 = fptrunc float %3403 to half
+ %3956 = fptrunc float %3406 to half
+ %3957 = fptrunc float %3407 to half
+ %3958 = fptrunc float %3408 to half
+ %3959 = fptrunc float %3409 to half
+ %3960 = fptrunc float %3412 to half
+ %3961 = fptrunc float %3413 to half
+ %3962 = fptrunc float %3414 to half
+ %3963 = fptrunc float %3415 to half
+ %3964 = fptrunc float %3418 to half
+ %3965 = fptrunc float %3419 to half
+ %3966 = fptrunc float %3420 to half
+ %3967 = fptrunc float %3421 to half
+ %3968 = insertelement <4 x half> poison, half %3952, i64 0
+ %3969 = insertelement <4 x half> %3968, half %3953, i64 1
+ %3970 = insertelement <4 x half> %3969, half %3954, i64 2
+ %3971 = insertelement <4 x half> %3970, half %3955, i64 3
+ %3972 = bitcast <4 x half> %3971 to <2 x i32>
+ %3973 = shl i32 %3948, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3972, ptr addrspace(8) %3659, i32 %3973, i32 0, i32 0)
+ %3974 = insertelement <4 x half> poison, half %3956, i64 0
+ %3975 = insertelement <4 x half> %3974, half %3957, i64 1
+ %3976 = insertelement <4 x half> %3975, half %3958, i64 2
+ %3977 = insertelement <4 x half> %3976, half %3959, i64 3
+ %3978 = bitcast <4 x half> %3977 to <2 x i32>
+ %3979 = shl i32 %3949, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3978, ptr addrspace(8) %3659, i32 %3979, i32 0, i32 0)
+ %3980 = insertelement <4 x half> poison, half %3960, i64 0
+ %3981 = insertelement <4 x half> %3980, half %3961, i64 1
+ %3982 = insertelement <4 x half> %3981, half %3962, i64 2
+ %3983 = insertelement <4 x half> %3982, half %3963, i64 3
+ %3984 = bitcast <4 x half> %3983 to <2 x i32>
+ %3985 = shl i32 %3950, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3984, ptr addrspace(8) %3659, i32 %3985, i32 0, i32 0)
+ %3986 = insertelement <4 x half> poison, half %3964, i64 0
+ %3987 = insertelement <4 x half> %3986, half %3965, i64 1
+ %3988 = insertelement <4 x half> %3987, half %3966, i64 2
+ %3989 = insertelement <4 x half> %3988, half %3967, i64 3
+ %3990 = bitcast <4 x half> %3989 to <2 x i32>
+ %3991 = shl i32 %3951, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %3990, ptr addrspace(8) %3659, i32 %3991, i32 0, i32 0)
+ %3992 = add i32 %3635, %2295
+ %3993 = add i32 %3635, %2302
+ %3994 = add i32 %3636, %2295
+ %3995 = add i32 %3636, %2302
+ %3996 = fptrunc float %3430 to half
+ %3997 = fptrunc float %3431 to half
+ %3998 = fptrunc float %3432 to half
+ %3999 = fptrunc float %3433 to half
+ %4000 = fptrunc float %3436 to half
+ %4001 = fptrunc float %3437 to half
+ %4002 = fptrunc float %3438 to half
+ %4003 = fptrunc float %3439 to half
+ %4004 = fptrunc float %3442 to half
+ %4005 = fptrunc float %3443 to half
+ %4006 = fptrunc float %3444 to half
+ %4007 = fptrunc float %3445 to half
+ %4008 = fptrunc float %3448 to half
+ %4009 = fptrunc float %3449 to half
+ %4010 = fptrunc float %3450 to half
+ %4011 = fptrunc float %3451 to half
+ %4012 = insertelement <4 x half> poison, half %3996, i64 0
+ %4013 = insertelement <4 x half> %4012, half %3997, i64 1
+ %4014 = insertelement <4 x half> %4013, half %3998, i64 2
+ %4015 = insertelement <4 x half> %4014, half %3999, i64 3
+ %4016 = bitcast <4 x half> %4015 to <2 x i32>
+ %4017 = shl i32 %3992, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4016, ptr addrspace(8) %3659, i32 %4017, i32 0, i32 0)
+ %4018 = insertelement <4 x half> poison, half %4000, i64 0
+ %4019 = insertelement <4 x half> %4018, half %4001, i64 1
+ %4020 = insertelement <4 x half> %4019, half %4002, i64 2
+ %4021 = insertelement <4 x half> %4020, half %4003, i64 3
+ %4022 = bitcast <4 x half> %4021 to <2 x i32>
+ %4023 = shl i32 %3993, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4022, ptr addrspace(8) %3659, i32 %4023, i32 0, i32 0)
+ %4024 = insertelement <4 x half> poison, half %4004, i64 0
+ %4025 = insertelement <4 x half> %4024, half %4005, i64 1
+ %4026 = insertelement <4 x half> %4025, half %4006, i64 2
+ %4027 = insertelement <4 x half> %4026, half %4007, i64 3
+ %4028 = bitcast <4 x half> %4027 to <2 x i32>
+ %4029 = shl i32 %3994, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4028, ptr addrspace(8) %3659, i32 %4029, i32 0, i32 0)
+ %4030 = insertelement <4 x half> poison, half %4008, i64 0
+ %4031 = insertelement <4 x half> %4030, half %4009, i64 1
+ %4032 = insertelement <4 x half> %4031, half %4010, i64 2
+ %4033 = insertelement <4 x half> %4032, half %4011, i64 3
+ %4034 = bitcast <4 x half> %4033 to <2 x i32>
+ %4035 = shl i32 %3995, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4034, ptr addrspace(8) %3659, i32 %4035, i32 0, i32 0)
+ %4036 = add i32 %3635, %2301
+ %4037 = add i32 %3635, %2300
+ %4038 = add i32 %3636, %2301
+ %4039 = add i32 %3636, %2300
+ %4040 = fptrunc float %3454 to half
+ %4041 = fptrunc float %3455 to half
+ %4042 = fptrunc float %3456 to half
+ %4043 = fptrunc float %3457 to half
+ %4044 = fptrunc float %3460 to half
+ %4045 = fptrunc float %3461 to half
+ %4046 = fptrunc float %3462 to half
+ %4047 = fptrunc float %3463 to half
+ %4048 = fptrunc float %3466 to half
+ %4049 = fptrunc float %3467 to half
+ %4050 = fptrunc float %3468 to half
+ %4051 = fptrunc float %3469 to half
+ %4052 = fptrunc float %3472 to half
+ %4053 = fptrunc float %3473 to half
+ %4054 = fptrunc float %3474 to half
+ %4055 = fptrunc float %3475 to half
+ %4056 = insertelement <4 x half> poison, half %4040, i64 0
+ %4057 = insertelement <4 x half> %4056, half %4041, i64 1
+ %4058 = insertelement <4 x half> %4057, half %4042, i64 2
+ %4059 = insertelement <4 x half> %4058, half %4043, i64 3
+ %4060 = bitcast <4 x half> %4059 to <2 x i32>
+ %4061 = shl i32 %4036, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4060, ptr addrspace(8) %3659, i32 %4061, i32 0, i32 0)
+ %4062 = insertelement <4 x half> poison, half %4044, i64 0
+ %4063 = insertelement <4 x half> %4062, half %4045, i64 1
+ %4064 = insertelement <4 x half> %4063, half %4046, i64 2
+ %4065 = insertelement <4 x half> %4064, half %4047, i64 3
+ %4066 = bitcast <4 x half> %4065 to <2 x i32>
+ %4067 = shl i32 %4037, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4066, ptr addrspace(8) %3659, i32 %4067, i32 0, i32 0)
+ %4068 = insertelement <4 x half> poison, half %4048, i64 0
+ %4069 = insertelement <4 x half> %4068, half %4049, i64 1
+ %4070 = insertelement <4 x half> %4069, half %4050, i64 2
+ %4071 = insertelement <4 x half> %4070, half %4051, i64 3
+ %4072 = bitcast <4 x half> %4071 to <2 x i32>
+ %4073 = shl i32 %4038, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4072, ptr addrspace(8) %3659, i32 %4073, i32 0, i32 0)
+ %4074 = insertelement <4 x half> poison, half %4052, i64 0
+ %4075 = insertelement <4 x half> %4074, half %4053, i64 1
+ %4076 = insertelement <4 x half> %4075, half %4054, i64 2
+ %4077 = insertelement <4 x half> %4076, half %4055, i64 3
+ %4078 = bitcast <4 x half> %4077 to <2 x i32>
+ %4079 = shl i32 %4039, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4078, ptr addrspace(8) %3659, i32 %4079, i32 0, i32 0)
+ %4080 = add i32 %3635, %2299
+ %4081 = add i32 %3635, %2298
+ %4082 = add i32 %3636, %2299
+ %4083 = add i32 %3636, %2298
+ %4084 = fptrunc float %3532 to half
+ %4085 = fptrunc float %3533 to half
+ %4086 = fptrunc float %3534 to half
+ %4087 = fptrunc float %3535 to half
+ %4088 = fptrunc float %3538 to half
+ %4089 = fptrunc float %3539 to half
+ %4090 = fptrunc float %3540 to half
+ %4091 = fptrunc float %3541 to half
+ %4092 = fptrunc float %3544 to half
+ %4093 = fptrunc float %3545 to half
+ %4094 = fptrunc float %3546 to half
+ %4095 = fptrunc float %3547 to half
+ %4096 = fptrunc float %3550 to half
+ %4097 = fptrunc float %3551 to half
+ %4098 = fptrunc float %3552 to half
+ %4099 = fptrunc float %3553 to half
+ %4100 = insertelement <4 x half> poison, half %4084, i64 0
+ %4101 = insertelement <4 x half> %4100, half %4085, i64 1
+ %4102 = insertelement <4 x half> %4101, half %4086, i64 2
+ %4103 = insertelement <4 x half> %4102, half %4087, i64 3
+ %4104 = bitcast <4 x half> %4103 to <2 x i32>
+ %4105 = shl i32 %4080, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4104, ptr addrspace(8) %3659, i32 %4105, i32 0, i32 0)
+ %4106 = insertelement <4 x half> poison, half %4088, i64 0
+ %4107 = insertelement <4 x half> %4106, half %4089, i64 1
+ %4108 = insertelement <4 x half> %4107, half %4090, i64 2
+ %4109 = insertelement <4 x half> %4108, half %4091, i64 3
+ %4110 = bitcast <4 x half> %4109 to <2 x i32>
+ %4111 = shl i32 %4081, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4110, ptr addrspace(8) %3659, i32 %4111, i32 0, i32 0)
+ %4112 = insertelement <4 x half> poison, half %4092, i64 0
+ %4113 = insertelement <4 x half> %4112, half %4093, i64 1
+ %4114 = insertelement <4 x half> %4113, half %4094, i64 2
+ %4115 = insertelement <4 x half> %4114, half %4095, i64 3
+ %4116 = bitcast <4 x half> %4115 to <2 x i32>
+ %4117 = shl i32 %4082, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4116, ptr addrspace(8) %3659, i32 %4117, i32 0, i32 0)
+ %4118 = insertelement <4 x half> poison, half %4096, i64 0
+ %4119 = insertelement <4 x half> %4118, half %4097, i64 1
+ %4120 = insertelement <4 x half> %4119, half %4098, i64 2
+ %4121 = insertelement <4 x half> %4120, half %4099, i64 3
+ %4122 = bitcast <4 x half> %4121 to <2 x i32>
+ %4123 = shl i32 %4083, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4122, ptr addrspace(8) %3659, i32 %4123, i32 0, i32 0)
+ %4124 = add i32 %3635, %2297
+ %4125 = add i32 %3635, %2296
+ %4126 = add i32 %3636, %2297
+ %4127 = add i32 %3636, %2296
+ %4128 = fptrunc float %3556 to half
+ %4129 = fptrunc float %3557 to half
+ %4130 = fptrunc float %3558 to half
+ %4131 = fptrunc float %3559 to half
+ %4132 = fptrunc float %3562 to half
+ %4133 = fptrunc float %3563 to half
+ %4134 = fptrunc float %3564 to half
+ %4135 = fptrunc float %3565 to half
+ %4136 = fptrunc float %3568 to half
+ %4137 = fptrunc float %3569 to half
+ %4138 = fptrunc float %3570 to half
+ %4139 = fptrunc float %3571 to half
+ %4140 = fptrunc float %3574 to half
+ %4141 = fptrunc float %3575 to half
+ %4142 = fptrunc float %3576 to half
+ %4143 = fptrunc float %3577 to half
+ %4144 = insertelement <4 x half> poison, half %4128, i64 0
+ %4145 = insertelement <4 x half> %4144, half %4129, i64 1
+ %4146 = insertelement <4 x half> %4145, half %4130, i64 2
+ %4147 = insertelement <4 x half> %4146, half %4131, i64 3
+ %4148 = bitcast <4 x half> %4147 to <2 x i32>
+ %4149 = shl i32 %4124, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4148, ptr addrspace(8) %3659, i32 %4149, i32 0, i32 0)
+ %4150 = insertelement <4 x half> poison, half %4132, i64 0
+ %4151 = insertelement <4 x half> %4150, half %4133, i64 1
+ %4152 = insertelement <4 x half> %4151, half %4134, i64 2
+ %4153 = insertelement <4 x half> %4152, half %4135, i64 3
+ %4154 = bitcast <4 x half> %4153 to <2 x i32>
+ %4155 = shl i32 %4125, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4154, ptr addrspace(8) %3659, i32 %4155, i32 0, i32 0)
+ %4156 = insertelement <4 x half> poison, half %4136, i64 0
+ %4157 = insertelement <4 x half> %4156, half %4137, i64 1
+ %4158 = insertelement <4 x half> %4157, half %4138, i64 2
+ %4159 = insertelement <4 x half> %4158, half %4139, i64 3
+ %4160 = bitcast <4 x half> %4159 to <2 x i32>
+ %4161 = shl i32 %4126, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4160, ptr addrspace(8) %3659, i32 %4161, i32 0, i32 0)
+ %4162 = insertelement <4 x half> poison, half %4140, i64 0
+ %4163 = insertelement <4 x half> %4162, half %4141, i64 1
+ %4164 = insertelement <4 x half> %4163, half %4142, i64 2
+ %4165 = insertelement <4 x half> %4164, half %4143, i64 3
+ %4166 = bitcast <4 x half> %4165 to <2 x i32>
+ %4167 = shl i32 %4127, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4166, ptr addrspace(8) %3659, i32 %4167, i32 0, i32 0)
+ %4168 = add i32 %3637, %2295
+ %4169 = add i32 %3637, %2302
+ %4170 = add i32 %3638, %2295
+ %4171 = add i32 %3638, %2302
+ %4172 = fptrunc float %3484 to half
+ %4173 = fptrunc float %3485 to half
+ %4174 = fptrunc float %3486 to half
+ %4175 = fptrunc float %3487 to half
+ %4176 = fptrunc float %3490 to half
+ %4177 = fptrunc float %3491 to half
+ %4178 = fptrunc float %3492 to half
+ %4179 = fptrunc float %3493 to half
+ %4180 = fptrunc float %3496 to half
+ %4181 = fptrunc float %3497 to half
+ %4182 = fptrunc float %3498 to half
+ %4183 = fptrunc float %3499 to half
+ %4184 = fptrunc float %3502 to half
+ %4185 = fptrunc float %3503 to half
+ %4186 = fptrunc float %3504 to half
+ %4187 = fptrunc float %3505 to half
+ %4188 = insertelement <4 x half> poison, half %4172, i64 0
+ %4189 = insertelement <4 x half> %4188, half %4173, i64 1
+ %4190 = insertelement <4 x half> %4189, half %4174, i64 2
+ %4191 = insertelement <4 x half> %4190, half %4175, i64 3
+ %4192 = bitcast <4 x half> %4191 to <2 x i32>
+ %4193 = shl i32 %4168, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4192, ptr addrspace(8) %3659, i32 %4193, i32 0, i32 0)
+ %4194 = insertelement <4 x half> poison, half %4176, i64 0
+ %4195 = insertelement <4 x half> %4194, half %4177, i64 1
+ %4196 = insertelement <4 x half> %4195, half %4178, i64 2
+ %4197 = insertelement <4 x half> %4196, half %4179, i64 3
+ %4198 = bitcast <4 x half> %4197 to <2 x i32>
+ %4199 = shl i32 %4169, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4198, ptr addrspace(8) %3659, i32 %4199, i32 0, i32 0)
+ %4200 = insertelement <4 x half> poison, half %4180, i64 0
+ %4201 = insertelement <4 x half> %4200, half %4181, i64 1
+ %4202 = insertelement <4 x half> %4201, half %4182, i64 2
+ %4203 = insertelement <4 x half> %4202, half %4183, i64 3
+ %4204 = bitcast <4 x half> %4203 to <2 x i32>
+ %4205 = shl i32 %4170, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4204, ptr addrspace(8) %3659, i32 %4205, i32 0, i32 0)
+ %4206 = insertelement <4 x half> poison, half %4184, i64 0
+ %4207 = insertelement <4 x half> %4206, half %4185, i64 1
+ %4208 = insertelement <4 x half> %4207, half %4186, i64 2
+ %4209 = insertelement <4 x half> %4208, half %4187, i64 3
+ %4210 = bitcast <4 x half> %4209 to <2 x i32>
+ %4211 = shl i32 %4171, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4210, ptr addrspace(8) %3659, i32 %4211, i32 0, i32 0)
+ %4212 = add i32 %3637, %2301
+ %4213 = add i32 %3637, %2300
+ %4214 = add i32 %3638, %2301
+ %4215 = add i32 %3638, %2300
+ %4216 = fptrunc float %3508 to half
+ %4217 = fptrunc float %3509 to half
+ %4218 = fptrunc float %3510 to half
+ %4219 = fptrunc float %3511 to half
+ %4220 = fptrunc float %3514 to half
+ %4221 = fptrunc float %3515 to half
+ %4222 = fptrunc float %3516 to half
+ %4223 = fptrunc float %3517 to half
+ %4224 = fptrunc float %3520 to half
+ %4225 = fptrunc float %3521 to half
+ %4226 = fptrunc float %3522 to half
+ %4227 = fptrunc float %3523 to half
+ %4228 = fptrunc float %3526 to half
+ %4229 = fptrunc float %3527 to half
+ %4230 = fptrunc float %3528 to half
+ %4231 = fptrunc float %3529 to half
+ %4232 = insertelement <4 x half> poison, half %4216, i64 0
+ %4233 = insertelement <4 x half> %4232, half %4217, i64 1
+ %4234 = insertelement <4 x half> %4233, half %4218, i64 2
+ %4235 = insertelement <4 x half> %4234, half %4219, i64 3
+ %4236 = bitcast <4 x half> %4235 to <2 x i32>
+ %4237 = shl i32 %4212, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4236, ptr addrspace(8) %3659, i32 %4237, i32 0, i32 0)
+ %4238 = insertelement <4 x half> poison, half %4220, i64 0
+ %4239 = insertelement <4 x half> %4238, half %4221, i64 1
+ %4240 = insertelement <4 x half> %4239, half %4222, i64 2
+ %4241 = insertelement <4 x half> %4240, half %4223, i64 3
+ %4242 = bitcast <4 x half> %4241 to <2 x i32>
+ %4243 = shl i32 %4213, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4242, ptr addrspace(8) %3659, i32 %4243, i32 0, i32 0)
+ %4244 = insertelement <4 x half> poison, half %4224, i64 0
+ %4245 = insertelement <4 x half> %4244, half %4225, i64 1
+ %4246 = insertelement <4 x half> %4245, half %4226, i64 2
+ %4247 = insertelement <4 x half> %4246, half %4227, i64 3
+ %4248 = bitcast <4 x half> %4247 to <2 x i32>
+ %4249 = shl i32 %4214, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4248, ptr addrspace(8) %3659, i32 %4249, i32 0, i32 0)
+ %4250 = insertelement <4 x half> poison, half %4228, i64 0
+ %4251 = insertelement <4 x half> %4250, half %4229, i64 1
+ %4252 = insertelement <4 x half> %4251, half %4230, i64 2
+ %4253 = insertelement <4 x half> %4252, half %4231, i64 3
+ %4254 = bitcast <4 x half> %4253 to <2 x i32>
+ %4255 = shl i32 %4215, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4254, ptr addrspace(8) %3659, i32 %4255, i32 0, i32 0)
+ %4256 = add i32 %3637, %2299
+ %4257 = add i32 %3637, %2298
+ %4258 = add i32 %3638, %2299
+ %4259 = add i32 %3638, %2298
+ %4260 = fptrunc float %3580 to half
+ %4261 = fptrunc float %3581 to half
+ %4262 = fptrunc float %3582 to half
+ %4263 = fptrunc float %3583 to half
+ %4264 = fptrunc float %3586 to half
+ %4265 = fptrunc float %3587 to half
+ %4266 = fptrunc float %3588 to half
+ %4267 = fptrunc float %3589 to half
+ %4268 = fptrunc float %3592 to half
+ %4269 = fptrunc float %3593 to half
+ %4270 = fptrunc float %3594 to half
+ %4271 = fptrunc float %3595 to half
+ %4272 = fptrunc float %3598 to half
+ %4273 = fptrunc float %3599 to half
+ %4274 = fptrunc float %3600 to half
+ %4275 = fptrunc float %3601 to half
+ %4276 = insertelement <4 x half> poison, half %4260, i64 0
+ %4277 = insertelement <4 x half> %4276, half %4261, i64 1
+ %4278 = insertelement <4 x half> %4277, half %4262, i64 2
+ %4279 = insertelement <4 x half> %4278, half %4263, i64 3
+ %4280 = bitcast <4 x half> %4279 to <2 x i32>
+ %4281 = shl i32 %4256, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4280, ptr addrspace(8) %3659, i32 %4281, i32 0, i32 0)
+ %4282 = insertelement <4 x half> poison, half %4264, i64 0
+ %4283 = insertelement <4 x half> %4282, half %4265, i64 1
+ %4284 = insertelement <4 x half> %4283, half %4266, i64 2
+ %4285 = insertelement <4 x half> %4284, half %4267, i64 3
+ %4286 = bitcast <4 x half> %4285 to <2 x i32>
+ %4287 = shl i32 %4257, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4286, ptr addrspace(8) %3659, i32 %4287, i32 0, i32 0)
+ %4288 = insertelement <4 x half> poison, half %4268, i64 0
+ %4289 = insertelement <4 x half> %4288, half %4269, i64 1
+ %4290 = insertelement <4 x half> %4289, half %4270, i64 2
+ %4291 = insertelement <4 x half> %4290, half %4271, i64 3
+ %4292 = bitcast <4 x half> %4291 to <2 x i32>
+ %4293 = shl i32 %4258, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4292, ptr addrspace(8) %3659, i32 %4293, i32 0, i32 0)
+ %4294 = insertelement <4 x half> poison, half %4272, i64 0
+ %4295 = insertelement <4 x half> %4294, half %4273, i64 1
+ %4296 = insertelement <4 x half> %4295, half %4274, i64 2
+ %4297 = insertelement <4 x half> %4296, half %4275, i64 3
+ %4298 = bitcast <4 x half> %4297 to <2 x i32>
+ %4299 = shl i32 %4259, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4298, ptr addrspace(8) %3659, i32 %4299, i32 0, i32 0)
+ %4300 = add i32 %3637, %2297
+ %4301 = add i32 %3637, %2296
+ %4302 = add i32 %3638, %2297
+ %4303 = add i32 %3638, %2296
+ %4304 = fptrunc float %3604 to half
+ %4305 = fptrunc float %3605 to half
+ %4306 = fptrunc float %3606 to half
+ %4307 = fptrunc float %3607 to half
+ %4308 = fptrunc float %3610 to half
+ %4309 = fptrunc float %3611 to half
+ %4310 = fptrunc float %3612 to half
+ %4311 = fptrunc float %3613 to half
+ %4312 = fptrunc float %3616 to half
+ %4313 = fptrunc float %3617 to half
+ %4314 = fptrunc float %3618 to half
+ %4315 = fptrunc float %3619 to half
+ %4316 = fptrunc float %3622 to half
+ %4317 = fptrunc float %3623 to half
+ %4318 = fptrunc float %3624 to half
+ %4319 = fptrunc float %3625 to half
+ %4320 = insertelement <4 x half> poison, half %4304, i64 0
+ %4321 = insertelement <4 x half> %4320, half %4305, i64 1
+ %4322 = insertelement <4 x half> %4321, half %4306, i64 2
+ %4323 = insertelement <4 x half> %4322, half %4307, i64 3
+ %4324 = bitcast <4 x half> %4323 to <2 x i32>
+ %4325 = shl i32 %4300, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4324, ptr addrspace(8) %3659, i32 %4325, i32 0, i32 0)
+ %4326 = insertelement <4 x half> poison, half %4308, i64 0
+ %4327 = insertelement <4 x half> %4326, half %4309, i64 1
+ %4328 = insertelement <4 x half> %4327, half %4310, i64 2
+ %4329 = insertelement <4 x half> %4328, half %4311, i64 3
+ %4330 = bitcast <4 x half> %4329 to <2 x i32>
+ %4331 = shl i32 %4301, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4330, ptr addrspace(8) %3659, i32 %4331, i32 0, i32 0)
+ %4332 = insertelement <4 x half> poison, half %4312, i64 0
+ %4333 = insertelement <4 x half> %4332, half %4313, i64 1
+ %4334 = insertelement <4 x half> %4333, half %4314, i64 2
+ %4335 = insertelement <4 x half> %4334, half %4315, i64 3
+ %4336 = bitcast <4 x half> %4335 to <2 x i32>
+ %4337 = shl i32 %4302, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4336, ptr addrspace(8) %3659, i32 %4337, i32 0, i32 0)
+ %4338 = insertelement <4 x half> poison, half %4316, i64 0
+ %4339 = insertelement <4 x half> %4338, half %4317, i64 1
+ %4340 = insertelement <4 x half> %4339, half %4318, i64 2
+ %4341 = insertelement <4 x half> %4340, half %4319, i64 3
+ %4342 = bitcast <4 x half> %4341 to <2 x i32>
+ %4343 = shl i32 %4303, 1
+ tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %4342, ptr addrspace(8) %3659, i32 %4343, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smin.i32(i32, i32) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #3
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.s.barrier() #4
+
+; Function Attrs: convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) #5
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+; NOTE(review): dangling "Function Attrs" comment above — the declare that followed it appears to have been dropped from this generated test; confirm the removal was intentional or delete the stray comment.
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #6
+
+; Function Attrs: convergent mustprogress nocallback nofree nounwind willreturn
+; NOTE(review): second dangling "Function Attrs" comment with no following declare — same cleanup question as the one above attributes #6.
+
+attributes #0 = { nofree norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #4 = { convergent mustprogress nocallback nofree nounwind willreturn }
+attributes #5 = { convergent mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!3 = !DIFile(filename: "<unknown>", directory: "")
+!4 = distinct !DISubprogram(name: "matmul_kernel", linkageName: "matmul_kernel", scope: !3, file: !3, type: !5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 222, column: 7, scope: !4)
+!8 = !DILocation(line: 224, column: 7, scope: !4)
+!9 = !DILocation(line: 225, column: 7, scope: !4)
+!10 = !DILocation(line: 230, column: 7, scope: !4)
+!11 = !DILocation(line: 231, column: 7, scope: !4)
+!12 = !DILocation(line: 232, column: 7, scope: !4)
+!13 = !DILocation(line: 233, column: 7, scope: !4)
+!14 = !DILocation(line: 234, column: 7, scope: !4)
+!15 = !DILocation(line: 235, column: 7, scope: !4)
+!16 = !DILocation(line: 236, column: 7, scope: !4)
+!17 = !DILocation(line: 237, column: 7, scope: !4)
+!18 = !DILocation(line: 238, column: 7, scope: !4)
+!19 = !DILocation(line: 241, column: 7, scope: !4)
+!20 = !DILocation(line: 242, column: 7, scope: !4)
+!21 = !DILocation(line: 247, column: 7, scope: !4)
+!22 = !DILocation(line: 249, column: 7, scope: !4)
+!23 = !DILocation(line: 250, column: 7, scope: !4)
+!24 = !DILocation(line: 255, column: 7, scope: !4)
+!25 = !DILocation(line: 256, column: 7, scope: !4)
+!26 = !DILocation(line: 257, column: 7, scope: !4)
+!27 = !DILocation(line: 258, column: 7, scope: !4)
+!28 = !DILocation(line: 259, column: 7, scope: !4)
+!29 = !DILocation(line: 260, column: 7, scope: !4)
+!30 = !DILocation(line: 261, column: 7, scope: !4)
+!31 = !DILocation(line: 262, column: 7, scope: !4)
+!32 = !DILocation(line: 263, column: 7, scope: !4)
+!33 = !DILocation(line: 266, column: 7, scope: !4)
+!34 = !DILocation(line: 267, column: 7, scope: !4)
+!35 = !DILocation(line: 275, column: 7, scope: !4)
+!36 = !DILocation(line: 280, column: 7, scope: !4)
+!37 = !DILocation(line: 285, column: 7, scope: !4)
+!38 = !DILocation(line: 286, column: 7, scope: !4)
+!39 = !DILocation(line: 287, column: 7, scope: !4)
+!40 = !DILocation(line: 288, column: 7, scope: !4)
+!41 = !DILocation(line: 289, column: 7, scope: !4)
+!42 = !DILocation(line: 290, column: 7, scope: !4)
+!43 = !DILocation(line: 291, column: 7, scope: !4)
+!44 = !DILocation(line: 292, column: 7, scope: !4)
+!45 = !DILocation(line: 293, column: 7, scope: !4)
+!46 = !DILocation(line: 298, column: 7, scope: !4)
+!47 = !DILocation(line: 299, column: 7, scope: !4)
+!48 = !DILocation(line: 300, column: 7, scope: !4)
+!49 = !DILocation(line: 301, column: 7, scope: !4)
+!50 = !DILocation(line: 302, column: 7, scope: !4)
+!51 = !DILocation(line: 303, column: 7, scope: !4)
+!52 = !DILocation(line: 304, column: 7, scope: !4)
+!53 = !DILocation(line: 305, column: 7, scope: !4)
+!54 = !DILocation(line: 306, column: 7, scope: !4)
+!55 = !DILocation(line: 309, column: 7, scope: !4)
+!56 = !DILocation(line: 310, column: 7, scope: !4)
+!57 = !DILocation(line: 315, column: 7, scope: !4)
+!58 = !DILocation(line: 316, column: 7, scope: !4)
+!59 = !DILocation(line: 321, column: 7, scope: !4)
+!60 = !DILocation(line: 322, column: 7, scope: !4)
+!61 = !DILocation(line: 323, column: 7, scope: !4)
+!62 = !DILocation(line: 324, column: 7, scope: !4)
+!63 = !DILocation(line: 325, column: 7, scope: !4)
+!64 = !DILocation(line: 326, column: 7, scope: !4)
+!65 = !DILocation(line: 327, column: 7, scope: !4)
+!66 = !DILocation(line: 328, column: 7, scope: !4)
+!67 = !DILocation(line: 329, column: 7, scope: !4)
+!68 = !DILocation(line: 334, column: 7, scope: !4)
+!69 = !DILocation(line: 335, column: 7, scope: !4)
+!70 = !DILocation(line: 336, column: 7, scope: !4)
+!71 = !DILocation(line: 337, column: 7, scope: !4)
+!72 = !DILocation(line: 338, column: 7, scope: !4)
+!73 = !DILocation(line: 339, column: 7, scope: !4)
+!74 = !DILocation(line: 340, column: 7, scope: !4)
+!75 = !DILocation(line: 341, column: 7, scope: !4)
+!76 = !DILocation(line: 342, column: 7, scope: !4)
+!77 = !DILocation(line: 347, column: 7, scope: !4)
+!78 = !DILocation(line: 348, column: 7, scope: !4)
+!79 = !DILocation(line: 364, column: 7, scope: !4)
+!80 = !DILocation(line: 365, column: 7, scope: !4)
+!81 = !DILocation(line: 366, column: 7, scope: !4)
+!82 = !DILocation(line: 367, column: 7, scope: !4)
+!83 = !DILocation(line: 368, column: 7, scope: !4)
+!84 = !DILocation(line: 369, column: 7, scope: !4)
+!85 = !DILocation(line: 370, column: 7, scope: !4)
+!86 = !DILocation(line: 371, column: 7, scope: !4)
+!87 = !DILocation(line: 372, column: 7, scope: !4)
+!88 = !DILocation(line: 373, column: 7, scope: !4)
+!89 = !DILocation(line: 374, column: 7, scope: !4)
+!90 = !DILocation(line: 375, column: 7, scope: !4)
+!91 = !DILocation(line: 376, column: 7, scope: !4)
+!92 = !DILocation(line: 377, column: 7, scope: !4)
+!93 = !DILocation(line: 378, column: 7, scope: !4)
+!94 = !DILocation(line: 379, column: 7, scope: !4)
+!95 = !DILocation(line: 380, column: 7, scope: !4)
+!96 = !DILocation(line: 381, column: 7, scope: !4)
+!97 = !DILocation(line: 382, column: 7, scope: !4)
+!98 = !DILocation(line: 383, column: 7, scope: !4)
+!99 = !DILocation(line: 384, column: 7, scope: !4)
+!100 = !DILocation(line: 385, column: 7, scope: !4)
+!101 = !DILocation(line: 386, column: 7, scope: !4)
+!102 = !DILocation(line: 387, column: 7, scope: !4)
+!103 = !DILocation(line: 388, column: 7, scope: !4)
+!104 = !DILocation(line: 389, column: 7, scope: !4)
+!105 = !DILocation(line: 390, column: 7, scope: !4)
+!106 = !DILocation(line: 391, column: 7, scope: !4)
+!107 = !DILocation(line: 392, column: 7, scope: !4)
+!108 = !DILocation(line: 393, column: 7, scope: !4)
+!109 = !DILocation(line: 394, column: 7, scope: !4)
+!110 = !DILocation(line: 395, column: 7, scope: !4)
+!111 = !DILocation(line: 396, column: 7, scope: !4)
+!112 = !DILocation(line: 412, column: 7, scope: !4)
+!113 = !DILocation(line: 413, column: 7, scope: !4)
+!114 = !DILocation(line: 414, column: 7, scope: !4)
+!115 = !DILocation(line: 415, column: 7, scope: !4)
+!116 = !DILocation(line: 416, column: 7, scope: !4)
+!117 = !DILocation(line: 417, column: 7, scope: !4)
+!118 = !DILocation(line: 418, column: 7, scope: !4)
+!119 = !DILocation(line: 419, column: 7, scope: !4)
+!120 = !DILocation(line: 420, column: 7, scope: !4)
+!121 = !DILocation(line: 421, column: 7, scope: !4)
+!122 = !DILocation(line: 422, column: 7, scope: !4)
+!123 = !DILocation(line: 423, column: 7, scope: !4)
+!124 = !DILocation(line: 424, column: 7, scope: !4)
+!125 = !DILocation(line: 425, column: 7, scope: !4)
+!126 = !DILocation(line: 426, column: 7, scope: !4)
+!127 = !DILocation(line: 427, column: 7, scope: !4)
+!128 = !DILocation(line: 428, column: 7, scope: !4)
+!129 = !DILocation(line: 429, column: 7, scope: !4)
+!130 = !DILocation(line: 430, column: 7, scope: !4)
+!131 = !DILocation(line: 431, column: 7, scope: !4)
+!132 = !DILocation(line: 432, column: 7, scope: !4)
+!133 = !DILocation(line: 433, column: 7, scope: !4)
+!134 = !DILocation(line: 434, column: 7, scope: !4)
+!135 = !DILocation(line: 435, column: 7, scope: !4)
+!136 = !DILocation(line: 436, column: 7, scope: !4)
+!137 = !DILocation(line: 437, column: 7, scope: !4)
+!138 = !DILocation(line: 438, column: 7, scope: !4)
+!139 = !DILocation(line: 439, column: 7, scope: !4)
+!140 = !DILocation(line: 440, column: 7, scope: !4)
+!141 = !DILocation(line: 441, column: 7, scope: !4)
+!142 = !DILocation(line: 442, column: 7, scope: !4)
+!143 = !DILocation(line: 443, column: 7, scope: !4)
+!144 = !DILocation(line: 444, column: 7, scope: !4)
+!145 = !DILocation(line: 449, column: 7, scope: !4)
+!146 = !DILocation(line: 450, column: 7, scope: !4)
+!147 = !DILocation(line: 456, column: 7, scope: !4)
+!148 = !DILocation(line: 457, column: 7, scope: !4)
+!149 = !DILocation(line: 458, column: 7, scope: !4)
+!150 = !DILocation(line: 459, column: 7, scope: !4)
+!151 = !DILocation(line: 460, column: 7, scope: !4)
+!152 = !DILocation(line: 461, column: 7, scope: !4)
+!153 = !DILocation(line: 462, column: 7, scope: !4)
+!154 = !DILocation(line: 463, column: 7, scope: !4)
+!155 = !DILocation(line: 464, column: 7, scope: !4)
+!156 = !DILocation(line: 469, column: 7, scope: !4)
+!157 = !DILocation(line: 470, column: 7, scope: !4)
+!158 = !DILocation(line: 471, column: 7, scope: !4)
+!159 = !DILocation(line: 472, column: 7, scope: !4)
+!160 = !DILocation(line: 473, column: 7, scope: !4)
+!161 = !DILocation(line: 474, column: 7, scope: !4)
+!162 = !DILocation(line: 475, column: 7, scope: !4)
+!163 = !DILocation(line: 476, column: 7, scope: !4)
+!164 = !DILocation(line: 477, column: 7, scope: !4)
+!165 = !DILocation(line: 480, column: 7, scope: !4)
+!166 = !DILocation(line: 481, column: 7, scope: !4)
+!167 = !DILocation(line: 166, column: 9, scope: !4)
+!168 = !DILocation(line: 174, column: 9, scope: !4)
+!169 = !DILocation(line: 175, column: 9, scope: !4)
+!170 = !DILocation(line: 176, column: 9, scope: !4)
+!171 = !DILocation(line: 177, column: 9, scope: !4)
+!172 = !DILocation(line: 178, column: 9, scope: !4)
+!173 = !DILocation(line: 179, column: 9, scope: !4)
+!174 = !DILocation(line: 180, column: 9, scope: !4)
+!175 = !DILocation(line: 181, column: 9, scope: !4)
+!176 = !DILocation(line: 182, column: 9, scope: !4)
+!177 = !DILocation(line: 183, column: 9, scope: !4)
+!178 = !DILocation(line: 184, column: 9, scope: !4)
+!179 = !DILocation(line: 185, column: 9, scope: !4)
+!180 = !DILocation(line: 186, column: 9, scope: !4)
+!181 = !DILocation(line: 187, column: 9, scope: !4)
+!182 = !DILocation(line: 188, column: 9, scope: !4)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll
new file mode 100644
index 0000000000000..74edd5a7a227b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.max.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; LDS-load -> MFMA -> LDS-store pipeline that exercises the IGLP scheduling mutation; checks below are autogenerated — regenerate, don't hand-edit
; GCN-LABEL: test_iglp_opt_mfma_gemm:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
; GCN-NEXT: v_mov_b32_e32 v3, 2.0
; GCN-NEXT: ; iglp_opt mask(0x00000000)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456
; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440
; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424
; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408
; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344
; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360
; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376
; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248
; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232
; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216
; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200
; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184
; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168
; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152
; GCN-NEXT: s_waitcnt lgkmcnt(8)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112
; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96
; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(4)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80
; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64
; GCN-NEXT: ds_read_b128 a[128:131], v1
; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16
; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32
; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:8304
; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288
; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272
; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256
; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240
; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224
; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208
; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624
; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608
; GCN-NEXT: s_nop 2
; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112
; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96
; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80
; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64
; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48
; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32
; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16
; GCN-NEXT: ds_write_b128 v0, a[128:131]
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: s_waitcnt lgkmcnt(8)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672
; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688
; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640
; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288
; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304
; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256
; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272
; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224
; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240
; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192
; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208
; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656
; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608
; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624
; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576
; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: s_endpgm
entry:
  call void @llvm.amdgcn.iglp.opt(i32 4) ; opt level 4 — the new strategy value this patch introduces. NOTE(review): the autogenerated check above prints "iglp_opt mask(0x00000000)", not 0x00000004 — confirm the checks were regenerated after the operand-printing was finalized
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx ; per-lane base tile in LDS (addrspace(3))
  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 ; each addr chains off the previous one, so offsets accumulate (64, +128, +192, +256 tiles)
  %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
  %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
  %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) ; five independent MFMAs — gives the IGLP mutation freedom to interleave them with the ds_reads
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0)
  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0)
  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0)
  %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
  store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 ; stores 1-4 use constant indices off %out (not %idx), so lanes overlap — acceptable for a codegen-only test
  store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
  store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
  store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
  store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr
  ret void
}
More information about the llvm-commits
mailing list