[llvm] [AMDGPU] Fix typo in v_dot4 combine (PR #115224)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 18:53:24 PST 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/115224
From dd740f1f451a741324e1cfb4c58a5a3af76a2b91 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 13:59:38 -0800
Subject: [PATCH 1/6] [AMDGPU] Fix typo in v_dot4 combine
Change-Id: Ifc201f58eddd8f8994690bacbf34f446ccf2a790
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 846 ++++++++++++++++++++++
2 files changed, 847 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a962e68c587c7..419414e5bd993d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14011,7 +14011,7 @@ static void placeSources(ByteProvider<SDValue> &Src0,
Src0s.push_back(
{*Src0.Src,
((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
- Src1.SrcOffset / 4});
+ Src0.SrcOffset / 4});
Src1s.push_back(
{*Src1.Src,
((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 108d85e024ad76..15734094db42cd 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3450,4 +3450,850 @@ entry:
}
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
+; GFX7-LABEL: ByteOffsetCorrectness:
+; GFX7: ; %bb.0: ; %.entry
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB17_5
+; GFX7-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v2
+; GFX7-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX7-NEXT: s_movk_i32 s0, 0x900
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 5, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, v6, v0
+; GFX7-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, s10, v4
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX7-NEXT: v_mov_b32_e32 v6, s9
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, s8, v4
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x48
+; GFX7-NEXT: s_movk_i32 s10, 0xffe1
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v7, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: s_mov_b32 s11, -1
+; GFX7-NEXT: s_mov_b64 s[12:13], 0
+; GFX7-NEXT: .LBB17_2: ; %.lr.ph
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX7-NEXT: v_mov_b32_e32 v8, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX7-NEXT: .LBB17_3: ; %.preheader2
+; GFX7-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: s_add_u32 s0, s0, 9
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
+; GFX7-NEXT: s_and_b64 vcc, exec, vcc
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mad_i32_i24 v8, v18, v9, v8
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mad_i32_i24 v8, v19, v10, v8
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mad_i32_i24 v8, v20, v11, v8
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mad_i32_i24 v8, v21, v12, v8
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mad_i32_i24 v8, v22, v13, v8
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mad_i32_i24 v8, v23, v14, v8
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mad_i32_i24 v8, v24, v15, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mad_i32_i24 v8, v25, v16, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v8, v26, v17, v8
+; GFX7-NEXT: s_cbranch_vccnz .LBB17_3
+; GFX7-NEXT: ; %bb.4: ; %.110
+; GFX7-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX7-NEXT: v_lshl_b64 v[9:10], v[2:3], 2
+; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
+; GFX7-NEXT: buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v2
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x900, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX7-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; GFX7-NEXT: s_cbranch_execnz .LBB17_2
+; GFX7-NEXT: .LBB17_5: ; %._crit_edge
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: ByteOffsetCorrectness:
+; GFX8: ; %bb.0: ; %.entry
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB17_5
+; GFX8-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s0, 0x900
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, 0x900, v3
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, 0x900, v3
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
+; GFX8-NEXT: s_movk_i32 s0, 0x48
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s6, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v4, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2
+; GFX8-NEXT: s_movk_i32 s4, 0xffe1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v4, v3, vcc
+; GFX8-NEXT: s_mov_b32 s5, -1
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB17_2: ; %.lr.ph
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX8-NEXT: v_mov_b32_e32 v10, 0
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB17_3: ; %.preheader2
+; GFX8-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v8
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v11, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v12, v[2:3]
+; GFX8-NEXT: s_add_u32 s0, s0, 9
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v12, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v4, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v2, v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_mad_i32_i24 v10, v10, v13, v12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v10, v2, v4, v10
+; GFX8-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX8-NEXT: ; %bb.4: ; %.110
+; GFX8-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v10
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x900, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x900, v8
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB17_2
+; GFX8-NEXT: .LBB17_5: ; %._crit_edge
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: ByteOffsetCorrectness:
+; GFX9-NODL: ; %bb.0: ; %.entry
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-NODL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NODL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NODL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-NODL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-NODL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-NODL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-NODL-NEXT: s_movk_i32 s6, 0xffe1
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: s_mov_b32 s7, -1
+; GFX9-NODL-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NODL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-NODL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-NODL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NODL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-NODL-NEXT: .LBB17_3: ; %.preheader2
+; GFX9-NODL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX9-NODL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-NODL-NEXT: global_load_sbyte v15, v[13:14], off
+; GFX9-NODL-NEXT: global_load_sbyte v16, v[11:12], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v17, v[11:12], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v18, v[11:12], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v19, v[11:12], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v20, v[11:12], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v21, v[11:12], off offset:6
+; GFX9-NODL-NEXT: global_load_sbyte v22, v[11:12], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v23, v[9:10], off
+; GFX9-NODL-NEXT: global_load_sbyte v24, v[9:10], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v25, v[9:10], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v26, v[9:10], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v27, v[9:10], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v28, v[9:10], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v29, v[9:10], off offset:6
+; GFX9-NODL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-NODL-NEXT: global_load_sbyte v11, v[9:10], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v12, v[13:14], off offset:8
+; GFX9-NODL-NEXT: global_load_sbyte v30, v[9:10], off offset:8
+; GFX9-NODL-NEXT: s_add_u32 s10, s10, 9
+; GFX9-NODL-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-NODL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v23, v15, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v24, v16, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v25, v17, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v26, v18, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v27, v19, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v28, v20, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v29, v21, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v11, v22, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v30, v12, v8
+; GFX9-NODL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX9-NODL-NEXT: ; %bb.4: ; %.110
+; GFX9-NODL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX9-NODL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-NODL-NEXT: global_store_dword v[9:10], v8, off
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NODL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NODL-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-NODL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NODL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-NODL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: ByteOffsetCorrectness:
+; GFX9-DL: ; %bb.0: ; %.entry
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-DL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-DL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-DL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-DL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-DL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-DL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-DL-NEXT: s_movk_i32 s8, 0xffe1
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-DL-NEXT: s_mov_b32 s12, 0xc0c0400
+; GFX9-DL-NEXT: s_mov_b32 s9, -1
+; GFX9-DL-NEXT: s_mov_b32 s13, 0x4000c0c
+; GFX9-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-DL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX9-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX9-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-DL-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-DL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-DL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
+; GFX9-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
+; GFX9-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
+; GFX9-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
+; GFX9-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
+; GFX9-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
+; GFX9-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
+; GFX9-DL-NEXT: global_load_sbyte v22, v[13:14], off
+; GFX9-DL-NEXT: global_load_sbyte v23, v[13:14], off offset:8
+; GFX9-DL-NEXT: global_load_sbyte v24, v[9:10], off
+; GFX9-DL-NEXT: global_load_sbyte v25, v[9:10], off offset:1
+; GFX9-DL-NEXT: global_load_sbyte v26, v[9:10], off offset:2
+; GFX9-DL-NEXT: global_load_sbyte v27, v[9:10], off offset:3
+; GFX9-DL-NEXT: global_load_sbyte v28, v[9:10], off offset:4
+; GFX9-DL-NEXT: global_load_sbyte v29, v[9:10], off offset:5
+; GFX9-DL-NEXT: ; kill: killed $vgpr13 killed $vgpr14
+; GFX9-DL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-DL-NEXT: global_load_sbyte v11, v[9:10], off offset:6
+; GFX9-DL-NEXT: global_load_sbyte v12, v[9:10], off offset:7
+; GFX9-DL-NEXT: global_load_sbyte v13, v[9:10], off offset:8
+; GFX9-DL-NEXT: s_add_u32 s10, s10, 9
+; GFX9-DL-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-DL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-DL-NEXT: s_waitcnt vmcnt(16)
+; GFX9-DL-NEXT: v_perm_b32 v9, v16, v15, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(14)
+; GFX9-DL-NEXT: v_perm_b32 v10, v18, v17, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX9-DL-NEXT: s_waitcnt vmcnt(12)
+; GFX9-DL-NEXT: v_perm_b32 v16, v20, v19, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX9-DL-NEXT: v_perm_b32 v17, v23, v21, s13
+; GFX9-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX9-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-DL-NEXT: v_perm_b32 v14, v26, v25, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-DL-NEXT: v_perm_b32 v15, v28, v27, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v10, v15, v14
+; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v10, v9, v8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT: v_perm_b32 v11, v11, v29, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v12, v13, v12, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v13, v17, v16
+; GFX9-DL-NEXT: v_or_b32_e32 v11, v12, v11
+; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v11, v13, v8
+; GFX9-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX9-DL-NEXT: ; %bb.4: ; %.110
+; GFX9-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX9-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-DL-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-DL-NEXT: global_store_dword v[9:10], v8, off
+; GFX9-DL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-DL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5]
+; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-DL-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-DL-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: ByteOffsetCorrectness:
+; GFX10-DL: ; %bb.0: ; %.entry
+; GFX10-DL-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
+; GFX10-DL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX10-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX10-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v5, 0x900, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, 0x900, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v6, v3, v2
+; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX10-DL-NEXT: s_movk_i32 s2, 0xffe1
+; GFX10-DL-NEXT: v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5]
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 5, v6
+; GFX10-DL-NEXT: v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2]
+; GFX10-DL-NEXT: s_mov_b32 s3, -1
+; GFX10-DL-NEXT: s_mov_b32 s6, 0
+; GFX10-DL-NEXT: v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4]
+; GFX10-DL-NEXT: v_add_co_u32 v0, s0, v8, v0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, s8, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, s8, v4
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v6, vcc_lo, s10, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo
+; GFX10-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX10-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-DL-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX10-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX10-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX10-DL-NEXT: s_clause 0x6
+; GFX10-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
+; GFX10-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
+; GFX10-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
+; GFX10-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
+; GFX10-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
+; GFX10-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
+; GFX10-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_sbyte v22, v[9:10], off
+; GFX10-DL-NEXT: global_load_sbyte v23, v[9:10], off offset:8
+; GFX10-DL-NEXT: s_clause 0x8
+; GFX10-DL-NEXT: global_load_sbyte v24, v[13:14], off
+; GFX10-DL-NEXT: global_load_sbyte v25, v[13:14], off offset:1
+; GFX10-DL-NEXT: global_load_sbyte v26, v[13:14], off offset:2
+; GFX10-DL-NEXT: global_load_sbyte v27, v[13:14], off offset:3
+; GFX10-DL-NEXT: global_load_sbyte v28, v[13:14], off offset:4
+; GFX10-DL-NEXT: global_load_sbyte v29, v[13:14], off offset:5
+; GFX10-DL-NEXT: ; meta instruction
+; GFX10-DL-NEXT: ; meta instruction
+; GFX10-DL-NEXT: global_load_sbyte v9, v[13:14], off offset:6
+; GFX10-DL-NEXT: global_load_sbyte v10, v[13:14], off offset:7
+; GFX10-DL-NEXT: global_load_sbyte v11, v[13:14], off offset:8
+; GFX10-DL-NEXT: s_add_u32 s0, s0, 9
+; GFX10-DL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX10-DL-NEXT: s_waitcnt vmcnt(16)
+; GFX10-DL-NEXT: v_perm_b32 v12, v16, v15, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(14)
+; GFX10-DL-NEXT: v_perm_b32 v13, v18, v17, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(12)
+; GFX10-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX10-DL-NEXT: v_perm_b32 v17, v23, v21, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX10-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
+; GFX10-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX10-DL-NEXT: v_perm_b32 v14, v26, v25, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX10-DL-NEXT: v_perm_b32 v15, v28, v27, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT: v_perm_b32 v9, v9, v29, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x4000c0c
+; GFX10-DL-NEXT: v_or_b32_e32 v11, v13, v12
+; GFX10-DL-NEXT: v_or_b32_e32 v12, v15, v14
+; GFX10-DL-NEXT: v_or_b32_e32 v13, v17, v16
+; GFX10-DL-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v12, v11
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v9, v13
+; GFX10-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX10-DL-NEXT: ; %bb.4: ; %.110
+; GFX10-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX10-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX10-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, s4, v9
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX10-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, v11
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, v12
+; GFX10-DL-NEXT: global_store_dword v[9:10], v8, off
+; GFX10-DL-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX10-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: ByteOffsetCorrectness:
+; GFX11-DL: ; %bb.0: ; %.entry
+; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-DL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_cmpx_gt_i64_e32 2, v[1:2]
+; GFX11-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX11-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 20, 10
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v4, 0x48, v1
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34
+; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v3, 0x900, v5
+; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, 0x900, v5
+; GFX11-DL-NEXT: v_add_nc_u32_e32 v9, v0, v5
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3]
+; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, 0x48, v1
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 5, v9
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT: v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4]
+; GFX11-DL-NEXT: v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6]
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT: v_add_co_u32 v0, s0, v0, v1
+; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, s4, v7
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, s4, v9
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v6, vcc_lo, s6, v7
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo
+; GFX11-DL-NEXT: s_movk_i32 s4, 0xffe1
+; GFX11-DL-NEXT: s_mov_b32 s5, -1
+; GFX11-DL-NEXT: s_mov_b32 s6, 0
+; GFX11-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX11-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX11-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-DL-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX11-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX11-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX11-DL-NEXT: s_clause 0x6
+; GFX11-DL-NEXT: global_load_i8 v15, v[11:12], off offset:1
+; GFX11-DL-NEXT: global_load_i8 v16, v[11:12], off offset:2
+; GFX11-DL-NEXT: global_load_i8 v17, v[11:12], off offset:3
+; GFX11-DL-NEXT: global_load_i8 v18, v[11:12], off offset:4
+; GFX11-DL-NEXT: global_load_i8 v19, v[11:12], off offset:5
+; GFX11-DL-NEXT: global_load_i8 v20, v[11:12], off offset:6
+; GFX11-DL-NEXT: global_load_i8 v11, v[11:12], off offset:7
+; GFX11-DL-NEXT: s_clause 0x1
+; GFX11-DL-NEXT: global_load_i8 v12, v[9:10], off
+; GFX11-DL-NEXT: global_load_i8 v9, v[9:10], off offset:8
+; GFX11-DL-NEXT: s_clause 0x8
+; GFX11-DL-NEXT: global_load_i8 v10, v[13:14], off
+; GFX11-DL-NEXT: global_load_i8 v21, v[13:14], off offset:1
+; GFX11-DL-NEXT: global_load_i8 v22, v[13:14], off offset:2
+; GFX11-DL-NEXT: global_load_i8 v23, v[13:14], off offset:3
+; GFX11-DL-NEXT: global_load_i8 v24, v[13:14], off offset:4
+; GFX11-DL-NEXT: global_load_i8 v25, v[13:14], off offset:5
+; GFX11-DL-NEXT: global_load_i8 v26, v[13:14], off offset:6
+; GFX11-DL-NEXT: global_load_i8 v27, v[13:14], off offset:7
+; GFX11-DL-NEXT: global_load_i8 v13, v[13:14], off offset:8
+; GFX11-DL-NEXT: s_add_u32 s0, s0, 9
+; GFX11-DL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX11-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX11-DL-NEXT: v_perm_b32 v9, v9, v11, 0x4000c0c
+; GFX11-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX11-DL-NEXT: v_mad_i32_i24 v8, v10, v12, v8
+; GFX11-DL-NEXT: v_perm_b32 v10, v16, v15, 0xc0c0400
+; GFX11-DL-NEXT: v_perm_b32 v12, v18, v17, 0x4000c0c
+; GFX11-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX11-DL-NEXT: v_perm_b32 v14, v22, v21, 0xc0c0400
+; GFX11-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX11-DL-NEXT: v_perm_b32 v15, v24, v23, 0x4000c0c
+; GFX11-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT: v_perm_b32 v11, v26, v25, 0xc0c0400
+; GFX11-DL-NEXT: v_or_b32_e32 v10, v12, v10
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v13, v13, v27, 0x4000c0c
+; GFX11-DL-NEXT: v_or_b32_e32 v12, v15, v14
+; GFX11-DL-NEXT: v_or_b32_e32 v9, v9, v16
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_or_b32_e32 v11, v13, v11
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX11-DL-NEXT: ; %bb.4: ; %.110
+; GFX11-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX11-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX11-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, s2, v9
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-DL-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-DL-NEXT: global_store_b32 v[9:10], v8, off
+; GFX11-DL-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX11-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX11-DL-NEXT: s_endpgm
+.entry:
+ %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %sworkitemx = sext i32 %workitemx to i64
+ %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %sworkitemy = sext i32 %workitemy to i64
+ %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %sworkitemz = sext i32 %workitemz to i64
+ %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz
+ %ivtemp1 = shl nsw i64 %ivtemp0, 5
+ %iv = add nsw i64 %ivtemp1, %sworkitemx
+ %cmp = icmp slt i64 %sworkitemx, 2
+ br i1 %cmp, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %.entry, %.110
+ %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ]
+ %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi
+ %scalarmul = mul nsw i64 %phi, 72
+ br label %.preheader2
+
+.preheader2: ; preds = %.lr.ph, %.preheader2
+ %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ]
+ %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ]
+ %mul0 = mul nuw nsw i64 %phi1, 9
+ %scalaradd = add nsw i64 %mul0, %scalarmul
+ %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd
+ %l10 = load i8, ptr addrspace(1) %gep10, align 1
+ %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd
+ %l11 = load i8, ptr addrspace(1) %gep11, align 1
+ %op11 = sext i8 %l10 to i32
+ %op10 = sext i8 %l11 to i32
+ %mul1 = mul nsw i32 %op10, %op11
+ %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67
+ %off2 = add nsw i64 %scalaradd, 1
+ %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2
+ %l21 = load i8, ptr addrspace(1) %gep21, align 1
+ %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2
+ %l20 = load i8, ptr addrspace(1) %gep20, align 1
+ %op21 = sext i8 %l21 to i32
+ %op20 = sext i8 %l20 to i32
+ %mul2 = mul nsw i32 %op20, %op21
+ %ivadd2 = add i32 %mul2, %ivadd1
+ %off3 = add nsw i64 %scalaradd, 2
+ %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3
+ %l31 = load i8, ptr addrspace(1) %gep31, align 1
+ %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3
+ %l30 = load i8, ptr addrspace(1) %gep30, align 1
+ %op31 = sext i8 %l31 to i32
+ %op30 = sext i8 %l30 to i32
+ %mul3 = mul nsw i32 %op30, %op31
+ %ivadd3 = add i32 %mul3, %ivadd2
+ %off4 = add nsw i64 %scalaradd, 3
+ %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4
+ %l41 = load i8, ptr addrspace(1) %gep41, align 1
+ %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4
+ %l40 = load i8, ptr addrspace(1) %gep40, align 1
+ %op41 = sext i8 %l41 to i32
+ %op40 = sext i8 %l40 to i32
+ %mul4 = mul nsw i32 %op40, %op41
+ %ivadd4 = add i32 %mul4, %ivadd3
+ %off5 = add nsw i64 %scalaradd, 4
+ %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5
+ %l51 = load i8, ptr addrspace(1) %gep51, align 1
+ %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5
+ %l50 = load i8, ptr addrspace(1) %gep50, align 1
+ %op51 = sext i8 %l51 to i32
+ %op50 = sext i8 %l50 to i32
+ %mul5 = mul nsw i32 %op50, %op51
+ %ivadd5 = add i32 %mul5, %ivadd4
+ %off6 = add nsw i64 %scalaradd, 5
+ %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6
+ %l61 = load i8, ptr addrspace(1) %gep61, align 1
+ %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6
+ %l60 = load i8, ptr addrspace(1) %gep60, align 1
+ %op61 = sext i8 %l61 to i32
+ %op60 = sext i8 %l60 to i32
+ %mul6 = mul nsw i32 %op60, %op61
+ %ivadd6 = add i32 %mul6, %ivadd5
+ %off7 = add nsw i64 %scalaradd, 6
+ %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7
+ %l71 = load i8, ptr addrspace(1) %gep71, align 1
+ %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7
+ %l70 = load i8, ptr addrspace(1) %gep70, align 1
+ %op71 = sext i8 %l71 to i32
+ %op70 = sext i8 %l70 to i32
+ %mul7 = mul nsw i32 %op70, %op71
+ %ivadd7 = add i32 %mul7, %ivadd6
+ %off8 = add nsw i64 %scalaradd, 7
+ %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8
+ %l81 = load i8, ptr addrspace(1) %gep81, align 1
+ %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8
+ %l80 = load i8, ptr addrspace(1) %gep80, align 1
+ %op81 = sext i8 %l81 to i32
+ %op80 = sext i8 %l80 to i32
+ %mul8 = mul nsw i32 %op80, %op81
+ %ivadd8 = add i32 %mul8, %ivadd7
+ %off9 = add nsw i64 %scalaradd, 8
+ %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9
+ %l91 = load i8, ptr addrspace(1) %gep91, align 1
+ %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9
+ %l90 = load i8, ptr addrspace(1) %gep90, align 1
+ %op91 = sext i8 %l91 to i32
+ %op90 = sext i8 %l90 to i32
+ %mul9 = mul nsw i32 %op90, %op91
+ %ivadd9 = add i32 %mul9, %ivadd8
+ %limit = add nuw nsw i64 %phi1, 1
+ %exitcond.not = icmp eq i64 %limit, 8
+ br i1 %exitcond.not, label %.110, label %.preheader2
+
+.110: ; preds = %.preheader2
+ store i32 %ivadd9, ptr addrspace(1) %outptr, align 4
+ %outerlimit = add nsw i64 %phi, 32
+ %outerexitcond = icmp slt i64 %phi, -30
+ br i1 %outerexitcond, label %.lr.ph, label %._crit_edge
+
+._crit_edge: ; preds = %.110, %.3
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()
>From 164e1a51f4d75722b328b8bc4c6ee20bf1be33e7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:32:31 -0800
Subject: [PATCH 2/6] Fix test
Change-Id: Ifa2ee3caaf13bc563119f79a241c3231557d401f
---
llvm/test/CodeGen/AMDGPU/idot4-combine.ll | 116 ++++++++++++++++++++++
1 file changed, 116 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/idot4-combine.ll
diff --git a/llvm/test/CodeGen/AMDGPU/idot4-combine.ll b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
new file mode 100644
index 00000000000000..18920fd4e40a24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+; The first (A) operand of the v_dot4 is derived from the LHS of each mul in the chain (that is, %l6080, %l7081, %l8082 and %l9083).
+; These are the bytes at zero-based indices 5, 6, 7 and 8 of the load %7.
+; Confirm that we are actually accessing these bytes.
+;
+; Previously, we used the dword offset from the corresponding byte in the second (B) operand.
+; The result was to access byte 3 of %7 instead of byte 7 (i.e. a dword offset of 0 instead of 1).
+
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2, ptr addrspace(1) %outptr) local_unnamed_addr #0 {
+; GFX11-LABEL: ByteOffsetCorrectness:
+; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: v_bfe_u32 v2, v0, 20, 10
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c
+; GFX11-NEXT: v_bfe_u32 v6, v0, 10, 10
+; GFX11-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX11-NEXT: v_mul_hi_u32_u24_e32 v1, 0x900, v2
+; GFX11-NEXT: v_mul_u32_u24_e32 v0, 0x900, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v8, v6, v2
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x900, v6, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: v_mul_hi_u32_u24_e32 v3, 0x48, v7
+; GFX11-NEXT: v_mul_u32_u24_e32 v2, 0x48, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0x900, v8, v[2:3]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x48, v7, v[4:5]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s6, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_i8 v7, v[4:5], off offset:7
+; GFX11-NEXT: global_load_i8 v2, v[2:3], off offset:8
+; GFX11-NEXT: global_load_d16_b16 v6, v[4:5], off offset:5
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_i8 v3, v[0:1], off offset:8
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v2, v7, 0x4000c0c
+; GFX11-NEXT: v_perm_b32 v2, v6, v6, 0xc0c0100
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x4030201
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
+.entry:
+ %ByteOffsetCorrectness.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; result has no uses in this function
+ %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %sworkitemx = sext i32 %workitemx to i64
+ %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %sworkitemy = sext i32 %workitemy to i64
+ %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %sworkitemz = sext i32 %workitemz to i64
+ %ivtemp0 = add i64 %sworkitemy, %sworkitemz
+ %ivtemp1 = shl nsw i64 %ivtemp0, 5
+ %iv = add nsw i64 %ivtemp1, %sworkitemx ; has no uses in this function
+ %0 = mul nsw i64 %ivtemp0, 2304
+ %1 = mul nsw i64 %sworkitemx, 72
+ %2 = add i64 %0, %1 ; byte offset (y + z) * 2304 + x * 72
+ %scevgep = getelementptr i8, ptr addrspace(1) %inptr0, i64 %2
+ %3 = mul nsw i64 %sworkitemy, 2304
+ %4 = mul nsw i64 %sworkitemz, 2304
+ %5 = add i64 %3, %4
+ %6 = add i64 %5, %1 ; byte offset y * 2304 + z * 2304 + x * 72
+ %scevgep49 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %6
+ %scevgep55 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %2
+ %scevgep54 = getelementptr i8, ptr addrspace(1) %scevgep49, i64 0
+ %l10 = load i8, ptr addrspace(1) %scevgep54, align 1 ; has no uses in this function
+ %scevgep58 = getelementptr i8, ptr addrspace(1) %scevgep55, i64 0
+ %7 = load <9 x i8>, ptr addrspace(1) %scevgep58, align 1 ; nine bytes from %inptr1 + %2; elements 5..8 feed the first mul operands
+ %l6080 = extractelement <9 x i8> %7, i32 5
+ %l7081 = extractelement <9 x i8> %7, i32 6
+ %l8082 = extractelement <9 x i8> %7, i32 7
+ %l9083 = extractelement <9 x i8> %7, i32 8
+ %scevgep35 = getelementptr i8, ptr addrspace(1) %scevgep, i64 0
+ %scevgep36 = getelementptr i8, ptr addrspace(1) %scevgep35, i64 1
+ %8 = load <7 x i8>, ptr addrspace(1) %scevgep36, align 1 ; seven bytes starting at %scevgep + 1
+ %l6188 = extractelement <7 x i8> %8, i32 4 ; byte at %scevgep + 5
+ %l7189 = extractelement <7 x i8> %8, i32 5 ; byte at %scevgep + 6
+ %l8190 = extractelement <7 x i8> %8, i32 6 ; byte at %scevgep + 7
+ %op61 = sext i8 %l6188 to i32
+ %op60 = sext i8 %l6080 to i32
+ %mul6 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op60, i32 %op61)
+ %ivadd6 = add i32 %mul6, 0 ; start of the four-term accumulation chain (initial accumulator 0)
+ %op71 = sext i8 %l7189 to i32
+ %op70 = sext i8 %l7081 to i32
+ %mul7 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op70, i32 %op71)
+ %ivadd7 = add i32 %mul7, %ivadd6
+ %op81 = sext i8 %l8190 to i32
+ %op80 = sext i8 %l8082 to i32
+ %mul8 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op80, i32 %op81)
+ %ivadd8 = add i32 %mul8, %ivadd7
+ %scevgep53 = getelementptr i8, ptr addrspace(1) %scevgep54, i64 8
+ %l91 = load i8, ptr addrspace(1) %scevgep53, align 1 ; byte at %scevgep49 + 8, loaded separately from the vector loads
+ %op91 = sext i8 %l91 to i32
+ %op90 = sext i8 %l9083 to i32
+ %mul9 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op90, i32 %op91)
+ %ivadd9 = add i32 %mul9, %ivadd8
+ store i32 %ivadd9, ptr addrspace(1) %outptr, align 4 ; the accumulated sum is the only value written
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()
>From 7d81864d9f68083be83ab8faf3497a88e6fc1763 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:36:41 -0800
Subject: [PATCH 3/6] Remove old test
Change-Id: I70cc33b3e3af22d276ede907d3cbf9a2132f6ce4
---
llvm/test/CodeGen/AMDGPU/idot4s.ll | 848 +----------------------------
1 file changed, 1 insertion(+), 847 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 15734094db42cd..17182b20bfba7d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3449,851 +3449,5 @@ entry:
ret void
}
-
-define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
-; GFX7-LABEL: ByteOffsetCorrectness:
-; GFX7: ; %bb.0: ; %.entry
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX7-NEXT: s_cbranch_execz .LBB17_5
-; GFX7-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v2
-; GFX7-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
-; GFX7-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
-; GFX7-NEXT: s_movk_i32 s0, 0x900
-; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
-; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 5, v3
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, v6, v0
-; GFX7-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s11
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, s10, v4
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX7-NEXT: v_mov_b32_e32 v6, s9
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, s8, v4
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: v_mov_b32_e32 v6, 0x48
-; GFX7-NEXT: s_movk_i32 s10, 0xffe1
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mov_b32_e32 v7, 0
-; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX7-NEXT: s_mov_b32 s11, -1
-; GFX7-NEXT: s_mov_b64 s[12:13], 0
-; GFX7-NEXT: .LBB17_2: ; %.lr.ph
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX7-NEXT: v_mov_b32_e32 v8, 0
-; GFX7-NEXT: s_mov_b64 s[0:1], s[8:9]
-; GFX7-NEXT: .LBB17_3: ; %.preheader2
-; GFX7-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
-; GFX7-NEXT: buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
-; GFX7-NEXT: buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
-; GFX7-NEXT: buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
-; GFX7-NEXT: buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
-; GFX7-NEXT: buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
-; GFX7-NEXT: buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
-; GFX7-NEXT: buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
-; GFX7-NEXT: buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
-; GFX7-NEXT: buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
-; GFX7-NEXT: buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
-; GFX7-NEXT: buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
-; GFX7-NEXT: buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
-; GFX7-NEXT: buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
-; GFX7-NEXT: buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
-; GFX7-NEXT: buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
-; GFX7-NEXT: s_add_u32 s0, s0, 9
-; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
-; GFX7-NEXT: s_and_b64 vcc, exec, vcc
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mad_i32_i24 v8, v18, v9, v8
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mad_i32_i24 v8, v19, v10, v8
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mad_i32_i24 v8, v20, v11, v8
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mad_i32_i24 v8, v21, v12, v8
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mad_i32_i24 v8, v22, v13, v8
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mad_i32_i24 v8, v23, v14, v8
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mad_i32_i24 v8, v24, v15, v8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mad_i32_i24 v8, v25, v16, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_i32_i24 v8, v26, v17, v8
-; GFX7-NEXT: s_cbranch_vccnz .LBB17_3
-; GFX7-NEXT: ; %bb.4: ; %.110
-; GFX7-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX7-NEXT: v_lshl_b64 v[9:10], v[2:3], 2
-; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
-; GFX7-NEXT: buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v2
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x900, v4
-; GFX7-NEXT: v_mov_b32_e32 v2, v8
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX7-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13]
-; GFX7-NEXT: v_mov_b32_e32 v3, v9
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[12:13]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_2
-; GFX7-NEXT: .LBB17_5: ; %._crit_edge
-; GFX7-NEXT: s_endpgm
-;
-; GFX8-LABEL: ByteOffsetCorrectness:
-; GFX8: ; %bb.0: ; %.entry
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB17_5
-; GFX8-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v2
-; GFX8-NEXT: s_movk_i32 s0, 0x900
-; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, 0x900, v3
-; GFX8-NEXT: v_mul_u32_u24_e32 v3, 0x900, v3
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
-; GFX8-NEXT: s_movk_i32 s0, 0x48
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s7
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, s6, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v4, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2
-; GFX8-NEXT: s_movk_i32 s4, 0xffe1
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v4, v3, vcc
-; GFX8-NEXT: s_mov_b32 s5, -1
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: .LBB17_2: ; %.lr.ph
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX8-NEXT: v_mov_b32_e32 v10, 0
-; GFX8-NEXT: s_mov_b64 s[0:1], 0
-; GFX8-NEXT: .LBB17_3: ; %.preheader2
-; GFX8-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v8
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v6
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v11, v[4:5]
-; GFX8-NEXT: flat_load_sbyte v12, v[2:3]
-; GFX8-NEXT: s_add_u32 s0, s0, 9
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0x48
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v12, v11, v10
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v4
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2
-; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_sbyte v4, v[4:5]
-; GFX8-NEXT: flat_load_sbyte v2, v[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_mad_i32_i24 v10, v10, v13, v12
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v10, v2, v4, v10
-; GFX8-NEXT: s_cbranch_scc1 .LBB17_3
-; GFX8-NEXT: ; %bb.4: ; %.110
-; GFX8-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT: flat_store_dword v[2:3], v10
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x900, v6
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x900, v8
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB17_2
-; GFX8-NEXT: .LBB17_5: ; %._crit_edge
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-NODL-LABEL: ByteOffsetCorrectness:
-; GFX9-NODL: ; %bb.0: ; %.entry
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NODL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX9-NODL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-NODL-NEXT: s_cbranch_execz .LBB17_5
-; GFX9-NODL-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX9-NODL-NEXT: v_add_u32_e32 v10, v3, v2
-; GFX9-NODL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
-; GFX9-NODL-NEXT: s_movk_i32 s3, 0x900
-; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
-; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
-; GFX9-NODL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
-; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
-; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
-; GFX9-NODL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
-; GFX9-NODL-NEXT: s_movk_i32 s2, 0x48
-; GFX9-NODL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
-; GFX9-NODL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
-; GFX9-NODL-NEXT: s_movk_i32 s6, 0xffe1
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
-; GFX9-NODL-NEXT: s_mov_b32 s7, -1
-; GFX9-NODL-NEXT: s_mov_b64 s[8:9], 0
-; GFX9-NODL-NEXT: .LBB17_2: ; %.lr.ph
-; GFX9-NODL-NEXT: ; =>This Loop Header: Depth=1
-; GFX9-NODL-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NODL-NEXT: s_mov_b64 s[10:11], 0
-; GFX9-NODL-NEXT: .LBB17_3: ; %.preheader2
-; GFX9-NODL-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX9-NODL-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v12, s11
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
-; GFX9-NODL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
-; GFX9-NODL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
-; GFX9-NODL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
-; GFX9-NODL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
-; GFX9-NODL-NEXT: global_load_sbyte v15, v[13:14], off
-; GFX9-NODL-NEXT: global_load_sbyte v16, v[11:12], off offset:1
-; GFX9-NODL-NEXT: global_load_sbyte v17, v[11:12], off offset:2
-; GFX9-NODL-NEXT: global_load_sbyte v18, v[11:12], off offset:3
-; GFX9-NODL-NEXT: global_load_sbyte v19, v[11:12], off offset:4
-; GFX9-NODL-NEXT: global_load_sbyte v20, v[11:12], off offset:5
-; GFX9-NODL-NEXT: global_load_sbyte v21, v[11:12], off offset:6
-; GFX9-NODL-NEXT: global_load_sbyte v22, v[11:12], off offset:7
-; GFX9-NODL-NEXT: global_load_sbyte v23, v[9:10], off
-; GFX9-NODL-NEXT: global_load_sbyte v24, v[9:10], off offset:1
-; GFX9-NODL-NEXT: global_load_sbyte v25, v[9:10], off offset:2
-; GFX9-NODL-NEXT: global_load_sbyte v26, v[9:10], off offset:3
-; GFX9-NODL-NEXT: global_load_sbyte v27, v[9:10], off offset:4
-; GFX9-NODL-NEXT: global_load_sbyte v28, v[9:10], off offset:5
-; GFX9-NODL-NEXT: global_load_sbyte v29, v[9:10], off offset:6
-; GFX9-NODL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
-; GFX9-NODL-NEXT: global_load_sbyte v11, v[9:10], off offset:7
-; GFX9-NODL-NEXT: global_load_sbyte v12, v[13:14], off offset:8
-; GFX9-NODL-NEXT: global_load_sbyte v30, v[9:10], off offset:8
-; GFX9-NODL-NEXT: s_add_u32 s10, s10, 9
-; GFX9-NODL-NEXT: s_addc_u32 s11, s11, 0
-; GFX9-NODL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v23, v15, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v24, v16, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v25, v17, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v26, v18, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v27, v19, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v28, v20, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v29, v21, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v11, v22, v8
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v30, v12, v8
-; GFX9-NODL-NEXT: s_cbranch_scc1 .LBB17_3
-; GFX9-NODL-NEXT: ; %bb.4: ; %.110
-; GFX9-NODL-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX9-NODL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
-; GFX9-NODL-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
-; GFX9-NODL-NEXT: global_store_dword v[9:10], v8, off
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NODL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
-; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NODL-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-NODL-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX9-NODL-NEXT: s_cbranch_execnz .LBB17_2
-; GFX9-NODL-NEXT: .LBB17_5: ; %._crit_edge
-; GFX9-NODL-NEXT: s_endpgm
-;
-; GFX9-DL-LABEL: ByteOffsetCorrectness:
-; GFX9-DL: ; %bb.0: ; %.entry
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX9-DL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-DL-NEXT: s_cbranch_execz .LBB17_5
-; GFX9-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX9-DL-NEXT: v_add_u32_e32 v10, v3, v2
-; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
-; GFX9-DL-NEXT: s_movk_i32 s3, 0x900
-; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
-; GFX9-DL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
-; GFX9-DL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
-; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
-; GFX9-DL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
-; GFX9-DL-NEXT: s_movk_i32 s2, 0x48
-; GFX9-DL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
-; GFX9-DL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
-; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
-; GFX9-DL-NEXT: s_movk_i32 s8, 0xffe1
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
-; GFX9-DL-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-DL-NEXT: s_mov_b32 s12, 0xc0c0400
-; GFX9-DL-NEXT: s_mov_b32 s9, -1
-; GFX9-DL-NEXT: s_mov_b32 s13, 0x4000c0c
-; GFX9-DL-NEXT: .LBB17_2: ; %.lr.ph
-; GFX9-DL-NEXT: ; =>This Loop Header: Depth=1
-; GFX9-DL-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DL-NEXT: s_mov_b64 s[10:11], 0
-; GFX9-DL-NEXT: .LBB17_3: ; %.preheader2
-; GFX9-DL-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX9-DL-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX9-DL-NEXT: v_mov_b32_e32 v12, s11
-; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
-; GFX9-DL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
-; GFX9-DL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
-; GFX9-DL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
-; GFX9-DL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
-; GFX9-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
-; GFX9-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
-; GFX9-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
-; GFX9-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
-; GFX9-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
-; GFX9-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
-; GFX9-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
-; GFX9-DL-NEXT: global_load_sbyte v22, v[13:14], off
-; GFX9-DL-NEXT: global_load_sbyte v23, v[13:14], off offset:8
-; GFX9-DL-NEXT: global_load_sbyte v24, v[9:10], off
-; GFX9-DL-NEXT: global_load_sbyte v25, v[9:10], off offset:1
-; GFX9-DL-NEXT: global_load_sbyte v26, v[9:10], off offset:2
-; GFX9-DL-NEXT: global_load_sbyte v27, v[9:10], off offset:3
-; GFX9-DL-NEXT: global_load_sbyte v28, v[9:10], off offset:4
-; GFX9-DL-NEXT: global_load_sbyte v29, v[9:10], off offset:5
-; GFX9-DL-NEXT: ; kill: killed $vgpr13 killed $vgpr14
-; GFX9-DL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
-; GFX9-DL-NEXT: global_load_sbyte v11, v[9:10], off offset:6
-; GFX9-DL-NEXT: global_load_sbyte v12, v[9:10], off offset:7
-; GFX9-DL-NEXT: global_load_sbyte v13, v[9:10], off offset:8
-; GFX9-DL-NEXT: s_add_u32 s10, s10, 9
-; GFX9-DL-NEXT: s_addc_u32 s11, s11, 0
-; GFX9-DL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
-; GFX9-DL-NEXT: s_waitcnt vmcnt(16)
-; GFX9-DL-NEXT: v_perm_b32 v9, v16, v15, s12
-; GFX9-DL-NEXT: s_waitcnt vmcnt(14)
-; GFX9-DL-NEXT: v_perm_b32 v10, v18, v17, s13
-; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v9
-; GFX9-DL-NEXT: s_waitcnt vmcnt(12)
-; GFX9-DL-NEXT: v_perm_b32 v16, v20, v19, s12
-; GFX9-DL-NEXT: s_waitcnt vmcnt(9)
-; GFX9-DL-NEXT: v_perm_b32 v17, v23, v21, s13
-; GFX9-DL-NEXT: s_waitcnt vmcnt(8)
-; GFX9-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
-; GFX9-DL-NEXT: s_waitcnt vmcnt(6)
-; GFX9-DL-NEXT: v_perm_b32 v14, v26, v25, s12
-; GFX9-DL-NEXT: s_waitcnt vmcnt(4)
-; GFX9-DL-NEXT: v_perm_b32 v15, v28, v27, s13
-; GFX9-DL-NEXT: v_or_b32_e32 v10, v15, v14
-; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v10, v9, v8
-; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v11, v11, v29, s12
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v12, v13, v12, s13
-; GFX9-DL-NEXT: v_or_b32_e32 v13, v17, v16
-; GFX9-DL-NEXT: v_or_b32_e32 v11, v12, v11
-; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v11, v13, v8
-; GFX9-DL-NEXT: s_cbranch_scc1 .LBB17_3
-; GFX9-DL-NEXT: ; %bb.4: ; %.110
-; GFX9-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX9-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
-; GFX9-DL-NEXT: v_mov_b32_e32 v11, s5
-; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
-; GFX9-DL-NEXT: global_store_dword v[9:10], v8, off
-; GFX9-DL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-DL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5]
-; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-DL-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-DL-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX9-DL-NEXT: s_cbranch_execnz .LBB17_2
-; GFX9-DL-NEXT: .LBB17_5: ; %._crit_edge
-; GFX9-DL-NEXT: s_endpgm
-;
-; GFX10-DL-LABEL: ByteOffsetCorrectness:
-; GFX10-DL: ; %bb.0: ; %.entry
-; GFX10-DL-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
-; GFX10-DL-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX10-DL-NEXT: s_cbranch_execz .LBB17_5
-; GFX10-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v5, 0x900, v2
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, 0x900, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v6, v3, v2
-; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
-; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xffe1
-; GFX10-DL-NEXT: v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5]
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 5, v6
-; GFX10-DL-NEXT: v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2]
-; GFX10-DL-NEXT: s_mov_b32 s3, -1
-; GFX10-DL-NEXT: s_mov_b32 s6, 0
-; GFX10-DL-NEXT: v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4]
-; GFX10-DL-NEXT: v_add_co_u32 v0, s0, v8, v0
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, s8, v6
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, s8, v4
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v6, vcc_lo, s10, v6
-; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0, s0
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo
-; GFX10-DL-NEXT: .LBB17_2: ; %.lr.ph
-; GFX10-DL-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-DL-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-DL-NEXT: s_mov_b64 s[0:1], 0
-; GFX10-DL-NEXT: .LBB17_3: ; %.preheader2
-; GFX10-DL-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX10-DL-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
-; GFX10-DL-NEXT: s_clause 0x6
-; GFX10-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
-; GFX10-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
-; GFX10-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
-; GFX10-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
-; GFX10-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
-; GFX10-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
-; GFX10-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
-; GFX10-DL-NEXT: s_clause 0x1
-; GFX10-DL-NEXT: global_load_sbyte v22, v[9:10], off
-; GFX10-DL-NEXT: global_load_sbyte v23, v[9:10], off offset:8
-; GFX10-DL-NEXT: s_clause 0x8
-; GFX10-DL-NEXT: global_load_sbyte v24, v[13:14], off
-; GFX10-DL-NEXT: global_load_sbyte v25, v[13:14], off offset:1
-; GFX10-DL-NEXT: global_load_sbyte v26, v[13:14], off offset:2
-; GFX10-DL-NEXT: global_load_sbyte v27, v[13:14], off offset:3
-; GFX10-DL-NEXT: global_load_sbyte v28, v[13:14], off offset:4
-; GFX10-DL-NEXT: global_load_sbyte v29, v[13:14], off offset:5
-; GFX10-DL-NEXT: ; meta instruction
-; GFX10-DL-NEXT: ; meta instruction
-; GFX10-DL-NEXT: global_load_sbyte v9, v[13:14], off offset:6
-; GFX10-DL-NEXT: global_load_sbyte v10, v[13:14], off offset:7
-; GFX10-DL-NEXT: global_load_sbyte v11, v[13:14], off offset:8
-; GFX10-DL-NEXT: s_add_u32 s0, s0, 9
-; GFX10-DL-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
-; GFX10-DL-NEXT: s_waitcnt vmcnt(16)
-; GFX10-DL-NEXT: v_perm_b32 v12, v16, v15, 0xc0c0400
-; GFX10-DL-NEXT: s_waitcnt vmcnt(14)
-; GFX10-DL-NEXT: v_perm_b32 v13, v18, v17, 0x4000c0c
-; GFX10-DL-NEXT: s_waitcnt vmcnt(12)
-; GFX10-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
-; GFX10-DL-NEXT: s_waitcnt vmcnt(9)
-; GFX10-DL-NEXT: v_perm_b32 v17, v23, v21, 0x4000c0c
-; GFX10-DL-NEXT: s_waitcnt vmcnt(8)
-; GFX10-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
-; GFX10-DL-NEXT: s_waitcnt vmcnt(6)
-; GFX10-DL-NEXT: v_perm_b32 v14, v26, v25, 0xc0c0400
-; GFX10-DL-NEXT: s_waitcnt vmcnt(4)
-; GFX10-DL-NEXT: v_perm_b32 v15, v28, v27, 0x4000c0c
-; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT: v_perm_b32 v9, v9, v29, 0xc0c0400
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x4000c0c
-; GFX10-DL-NEXT: v_or_b32_e32 v11, v13, v12
-; GFX10-DL-NEXT: v_or_b32_e32 v12, v15, v14
-; GFX10-DL-NEXT: v_or_b32_e32 v13, v17, v16
-; GFX10-DL-NEXT: v_or_b32_e32 v9, v10, v9
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v12, v11
-; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v9, v13
-; GFX10-DL-NEXT: s_cbranch_scc1 .LBB17_3
-; GFX10-DL-NEXT: ; %bb.4: ; %.110
-; GFX10-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX10-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
-; GFX10-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
-; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
-; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, s4, v9
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
-; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX10-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, v12
-; GFX10-DL-NEXT: global_store_dword v[9:10], v8, off
-; GFX10-DL-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
-; GFX10-DL-NEXT: s_cbranch_execnz .LBB17_2
-; GFX10-DL-NEXT: .LBB17_5: ; %._crit_edge
-; GFX10-DL-NEXT: s_endpgm
-;
-; GFX11-DL-LABEL: ByteOffsetCorrectness:
-; GFX11-DL: ; %bb.0: ; %.entry
-; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11-DL-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_cmpx_gt_i64_e32 2, v[1:2]
-; GFX11-DL-NEXT: s_cbranch_execz .LBB17_5
-; GFX11-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
-; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 20, 10
-; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v4, 0x48, v1
-; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34
-; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v3, 0x900, v5
-; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, 0x900, v5
-; GFX11-DL-NEXT: v_add_nc_u32_e32 v9, v0, v5
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT: v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3]
-; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, 0x48, v1
-; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 5, v9
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-NEXT: v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4]
-; GFX11-DL-NEXT: v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6]
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT: v_add_co_u32 v0, s0, v0, v1
-; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
-; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, s4, v7
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, s4, v9
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v6, vcc_lo, s6, v7
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo
-; GFX11-DL-NEXT: s_movk_i32 s4, 0xffe1
-; GFX11-DL-NEXT: s_mov_b32 s5, -1
-; GFX11-DL-NEXT: s_mov_b32 s6, 0
-; GFX11-DL-NEXT: .LBB17_2: ; %.lr.ph
-; GFX11-DL-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-DL-NEXT: ; Child Loop BB17_3 Depth 2
-; GFX11-DL-NEXT: v_mov_b32_e32 v8, 0
-; GFX11-DL-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-DL-NEXT: .LBB17_3: ; %.preheader2
-; GFX11-DL-NEXT: ; Parent Loop BB17_2 Depth=1
-; GFX11-DL-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
-; GFX11-DL-NEXT: s_clause 0x6
-; GFX11-DL-NEXT: global_load_i8 v15, v[11:12], off offset:1
-; GFX11-DL-NEXT: global_load_i8 v16, v[11:12], off offset:2
-; GFX11-DL-NEXT: global_load_i8 v17, v[11:12], off offset:3
-; GFX11-DL-NEXT: global_load_i8 v18, v[11:12], off offset:4
-; GFX11-DL-NEXT: global_load_i8 v19, v[11:12], off offset:5
-; GFX11-DL-NEXT: global_load_i8 v20, v[11:12], off offset:6
-; GFX11-DL-NEXT: global_load_i8 v11, v[11:12], off offset:7
-; GFX11-DL-NEXT: s_clause 0x1
-; GFX11-DL-NEXT: global_load_i8 v12, v[9:10], off
-; GFX11-DL-NEXT: global_load_i8 v9, v[9:10], off offset:8
-; GFX11-DL-NEXT: s_clause 0x8
-; GFX11-DL-NEXT: global_load_i8 v10, v[13:14], off
-; GFX11-DL-NEXT: global_load_i8 v21, v[13:14], off offset:1
-; GFX11-DL-NEXT: global_load_i8 v22, v[13:14], off offset:2
-; GFX11-DL-NEXT: global_load_i8 v23, v[13:14], off offset:3
-; GFX11-DL-NEXT: global_load_i8 v24, v[13:14], off offset:4
-; GFX11-DL-NEXT: global_load_i8 v25, v[13:14], off offset:5
-; GFX11-DL-NEXT: global_load_i8 v26, v[13:14], off offset:6
-; GFX11-DL-NEXT: global_load_i8 v27, v[13:14], off offset:7
-; GFX11-DL-NEXT: global_load_i8 v13, v[13:14], off offset:8
-; GFX11-DL-NEXT: s_add_u32 s0, s0, 9
-; GFX11-DL-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
-; GFX11-DL-NEXT: s_waitcnt vmcnt(9)
-; GFX11-DL-NEXT: v_perm_b32 v9, v9, v11, 0x4000c0c
-; GFX11-DL-NEXT: s_waitcnt vmcnt(8)
-; GFX11-DL-NEXT: v_mad_i32_i24 v8, v10, v12, v8
-; GFX11-DL-NEXT: v_perm_b32 v10, v16, v15, 0xc0c0400
-; GFX11-DL-NEXT: v_perm_b32 v12, v18, v17, 0x4000c0c
-; GFX11-DL-NEXT: s_waitcnt vmcnt(6)
-; GFX11-DL-NEXT: v_perm_b32 v14, v22, v21, 0xc0c0400
-; GFX11-DL-NEXT: s_waitcnt vmcnt(4)
-; GFX11-DL-NEXT: v_perm_b32 v15, v24, v23, 0x4000c0c
-; GFX11-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
-; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-NEXT: v_perm_b32 v11, v26, v25, 0xc0c0400
-; GFX11-DL-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT: v_perm_b32 v13, v13, v27, 0x4000c0c
-; GFX11-DL-NEXT: v_or_b32_e32 v12, v15, v14
-; GFX11-DL-NEXT: v_or_b32_e32 v9, v9, v16
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT: v_or_b32_e32 v11, v13, v11
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: s_cbranch_scc1 .LBB17_3
-; GFX11-DL-NEXT: ; %bb.4: ; %.110
-; GFX11-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
-; GFX11-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
-; GFX11-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
-; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, s2, v9
-; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
-; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX11-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-DL-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-DL-NEXT: global_store_b32 v[9:10], v8, off
-; GFX11-DL-NEXT: s_or_b32 s6, vcc_lo, s6
-; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
-; GFX11-DL-NEXT: s_cbranch_execnz .LBB17_2
-; GFX11-DL-NEXT: .LBB17_5: ; %._crit_edge
-; GFX11-DL-NEXT: s_endpgm
-.entry:
- %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
- %sworkitemx = sext i32 %workitemx to i64
- %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
- %sworkitemy = sext i32 %workitemy to i64
- %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
- %sworkitemz = sext i32 %workitemz to i64
- %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz
- %ivtemp1 = shl nsw i64 %ivtemp0, 5
- %iv = add nsw i64 %ivtemp1, %sworkitemx
- %cmp = icmp slt i64 %sworkitemx, 2
- br i1 %cmp, label %.lr.ph, label %._crit_edge
-
-.lr.ph: ; preds = %.entry, %.110
- %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ]
- %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi
- %scalarmul = mul nsw i64 %phi, 72
- br label %.preheader2
-
-.preheader2: ; preds = %.lr.ph, %.preheader2
- %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ]
- %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ]
- %mul0 = mul nuw nsw i64 %phi1, 9
- %scalaradd = add nsw i64 %mul0, %scalarmul
- %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd
- %l10 = load i8, ptr addrspace(1) %gep10, align 1
- %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd
- %l11 = load i8, ptr addrspace(1) %gep11, align 1
- %op11 = sext i8 %l10 to i32
- %op10 = sext i8 %l11 to i32
- %mul1 = mul nsw i32 %op10, %op11
- %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67
- %off2 = add nsw i64 %scalaradd, 1
- %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2
- %l21 = load i8, ptr addrspace(1) %gep21, align 1
- %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2
- %l20 = load i8, ptr addrspace(1) %gep20, align 1
- %op21 = sext i8 %l21 to i32
- %op20 = sext i8 %l20 to i32
- %mul2 = mul nsw i32 %op20, %op21
- %ivadd2 = add i32 %mul2, %ivadd1
- %off3 = add nsw i64 %scalaradd, 2
- %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3
- %l31 = load i8, ptr addrspace(1) %gep31, align 1
- %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3
- %l30 = load i8, ptr addrspace(1) %gep30, align 1
- %op31 = sext i8 %l31 to i32
- %op30 = sext i8 %l30 to i32
- %mul3 = mul nsw i32 %op30, %op31
- %ivadd3 = add i32 %mul3, %ivadd2
- %off4 = add nsw i64 %scalaradd, 3
- %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4
- %l41 = load i8, ptr addrspace(1) %gep41, align 1
- %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4
- %l40 = load i8, ptr addrspace(1) %gep40, align 1
- %op41 = sext i8 %l41 to i32
- %op40 = sext i8 %l40 to i32
- %mul4 = mul nsw i32 %op40, %op41
- %ivadd4 = add i32 %mul4, %ivadd3
- %off5 = add nsw i64 %scalaradd, 4
- %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5
- %l51 = load i8, ptr addrspace(1) %gep51, align 1
- %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5
- %l50 = load i8, ptr addrspace(1) %gep50, align 1
- %op51 = sext i8 %l51 to i32
- %op50 = sext i8 %l50 to i32
- %mul5 = mul nsw i32 %op50, %op51
- %ivadd5 = add i32 %mul5, %ivadd4
- %off6 = add nsw i64 %scalaradd, 5
- %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6
- %l61 = load i8, ptr addrspace(1) %gep61, align 1
- %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6
- %l60 = load i8, ptr addrspace(1) %gep60, align 1
- %op61 = sext i8 %l61 to i32
- %op60 = sext i8 %l60 to i32
- %mul6 = mul nsw i32 %op60, %op61
- %ivadd6 = add i32 %mul6, %ivadd5
- %off7 = add nsw i64 %scalaradd, 6
- %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7
- %l71 = load i8, ptr addrspace(1) %gep71, align 1
- %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7
- %l70 = load i8, ptr addrspace(1) %gep70, align 1
- %op71 = sext i8 %l71 to i32
- %op70 = sext i8 %l70 to i32
- %mul7 = mul nsw i32 %op70, %op71
- %ivadd7 = add i32 %mul7, %ivadd6
- %off8 = add nsw i64 %scalaradd, 7
- %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8
- %l81 = load i8, ptr addrspace(1) %gep81, align 1
- %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8
- %l80 = load i8, ptr addrspace(1) %gep80, align 1
- %op81 = sext i8 %l81 to i32
- %op80 = sext i8 %l80 to i32
- %mul8 = mul nsw i32 %op80, %op81
- %ivadd8 = add i32 %mul8, %ivadd7
- %off9 = add nsw i64 %scalaradd, 8
- %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9
- %l91 = load i8, ptr addrspace(1) %gep91, align 1
- %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9
- %l90 = load i8, ptr addrspace(1) %gep90, align 1
- %op91 = sext i8 %l91 to i32
- %op90 = sext i8 %l90 to i32
- %mul9 = mul nsw i32 %op90, %op91
- %ivadd9 = add i32 %mul9, %ivadd8
- %limit = add nuw nsw i64 %phi1, 1
- %exitcond.not = icmp eq i64 %limit, 8
- br i1 %exitcond.not, label %.110, label %.preheader2
-
-.110: ; preds = %.preheader2
- store i32 %ivadd9, ptr addrspace(1) %outptr, align 4
- %outerlimit = add nsw i64 %phi, 32
- %outerexitcond = icmp slt i64 %phi, -30
- br i1 %outerexitcond, label %.lr.ph, label %._crit_edge
-
-._crit_edge: ; preds = %.110, %.3
- ret void
-}
-
-
declare i32 @llvm.amdgcn.workitem.id.x()
-declare i32 @llvm.amdgcn.workitem.id.y()
-declare i32 @llvm.amdgcn.workitem.id.z()
+
>From 521267d82726275f7dc460200326dabd60510b6e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:37:53 -0800
Subject: [PATCH 4/6] Remove newline
Change-Id: I670d272205b5431a1fc434abd94550747c49c15e
---
llvm/test/CodeGen/AMDGPU/idot4s.ll | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 17182b20bfba7d..4262ec1057924a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3450,4 +3450,3 @@ entry:
}
declare i32 @llvm.amdgcn.workitem.id.x()
-
>From 5343dd6ca665c02d95228c90b4a9dbecefa8cec8 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:38:33 -0800
Subject: [PATCH 5/6] Add newline
Change-Id: If26584f3e25c5a1e4ec33ca71ac1d331eae24103
---
llvm/test/CodeGen/AMDGPU/idot4s.ll | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 4262ec1057924a..108d85e024ad76 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3449,4 +3449,5 @@ entry:
ret void
}
+
declare i32 @llvm.amdgcn.workitem.id.x()
>From d41b3ae895057287866ca5637008bfc2fa928566 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes at amd.com>
Date: Wed, 6 Nov 2024 18:53:14 -0800
Subject: [PATCH 6/6] Update llvm/test/CodeGen/AMDGPU/idot4-combine.ll
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/test/CodeGen/AMDGPU/idot4-combine.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4-combine.ll b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
index 18920fd4e40a24..4aa8bb4dd47d58 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --start-before=amdgpu-isel < %s | FileCheck -check-prefixes=GFX11 %s
; The first (A) operand of the v_dot4 is derived from the LHS of the mul chain (that is %l6080, %l7081, %l8082, %l9083).
; These correspond to the 5th, 6th, 7th and 8th byte in the load %7.
More information about the llvm-commits
mailing list