[llvm] [AMDGPU] Fix typo in v_dot4 combine (PR #115224)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 17:38:49 PST 2024


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/115224

>From dd740f1f451a741324e1cfb4c58a5a3af76a2b91 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 13:59:38 -0800
Subject: [PATCH 1/5] [AMDGPU] Fix typo in v_dot4 combine

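placeSources() computed the third field of the Src0s entry from
Src1.SrcOffset / 4. Use Src0.SrcOffset / 4 so the entry is derived
from Src0, matching the Src1s entry that follows it. Add the
ByteOffsetCorrectness test to idot4s.ll.

Change-Id: Ifc201f58eddd8f8994690bacbf34f446ccf2a790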
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   2 +-
 llvm/test/CodeGen/AMDGPU/idot4s.ll        | 846 ++++++++++++++++++++++
 2 files changed, 847 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a962e68c587c7..419414e5bd993d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14011,7 +14011,7 @@ static void placeSources(ByteProvider<SDValue> &Src0,
   Src0s.push_back(
       {*Src0.Src,
        ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
-       Src1.SrcOffset / 4});
+       Src0.SrcOffset / 4});
   Src1s.push_back(
       {*Src1.Src,
        ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 108d85e024ad76..15734094db42cd 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3450,4 +3450,850 @@ entry:
 }
 
 
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
+; GFX7-LABEL: ByteOffsetCorrectness:
+; GFX7:       ; %bb.0: ; %.entry
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX7-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7-NEXT:    s_cbranch_execz .LBB17_5
+; GFX7-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v2
+; GFX7-NEXT:    v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX7-NEXT:    v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX7-NEXT:    s_movk_i32 s0, 0x900
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 5, v3
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v6, v0
+; GFX7-NEXT:    v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s10, v4
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v6, s9
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, s8, v4
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_mov_b32_e32 v6, 0x48
+; GFX7-NEXT:    s_movk_i32 s10, 0xffe1
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mov_b32_e32 v7, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT:    s_mov_b32 s11, -1
+; GFX7-NEXT:    s_mov_b64 s[12:13], 0
+; GFX7-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX7-NEXT:    ; =>This Loop Header: Depth=1
+; GFX7-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX7-NEXT:  .LBB17_3: ; %.preheader2
+; GFX7-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT:    buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT:    buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT:    buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT:    buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT:    buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT:    buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT:    buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT:    buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT:    buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT:    buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT:    buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT:    buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT:    buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT:    buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT:    buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT:    s_add_u32 s0, s0, 9
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
+; GFX7-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v18, v9, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v19, v10, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v20, v11, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v21, v12, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v22, v13, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v23, v14, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v24, v15, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v25, v16, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_i32_i24 v8, v26, v17, v8
+; GFX7-NEXT:    s_cbranch_vccnz .LBB17_3
+; GFX7-NEXT:  ; %bb.4: ; %.110
+; GFX7-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX7-NEXT:    v_lshl_b64 v[9:10], v[2:3], 2
+; GFX7-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
+; GFX7-NEXT:    buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v2
+; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x900, v4
+; GFX7-NEXT:    v_mov_b32_e32 v2, v8
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX7-NEXT:    s_or_b64 s[12:13], s[0:1], s[12:13]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v9
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[12:13]
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX7-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: ByteOffsetCorrectness:
+; GFX8:       ; %bb.0: ; %.entry
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB17_5
+; GFX8-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v3, v2
+; GFX8-NEXT:    s_movk_i32 s0, 0x900
+; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v4, 0x900, v3
+; GFX8-NEXT:    v_mul_u32_u24_e32 v3, 0x900, v3
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
+; GFX8-NEXT:    s_movk_i32 s0, 0x48
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 5, v5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
+; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, s7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v4, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v2
+; GFX8-NEXT:    s_movk_i32 s4, 0xffe1
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v4, v3, vcc
+; GFX8-NEXT:    s_mov_b32 s5, -1
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX8-NEXT:    ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX8-NEXT:    v_mov_b32_e32 v10, 0
+; GFX8-NEXT:    s_mov_b64 s[0:1], 0
+; GFX8-NEXT:  .LBB17_3: ; %.preheader2
+; GFX8-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v8
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v9, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v6
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v11, v[4:5]
+; GFX8-NEXT:    flat_load_sbyte v12, v[2:3]
+; GFX8-NEXT:    s_add_u32 s0, s0, 9
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v12, v11, v10
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 2, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 3, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 3, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 5, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 5, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 7, v4
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 7, v2
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
+; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_sbyte v4, v[4:5]
+; GFX8-NEXT:    flat_load_sbyte v2, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_mad_i32_i24 v10, v10, v13, v12
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_i32_i24 v10, v2, v4, v10
+; GFX8-NEXT:    s_cbranch_scc1 .LBB17_3
+; GFX8-NEXT:  ; %bb.4: ; %.110
+; GFX8-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT:    flat_store_dword v[2:3], v10
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x900, v6
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x900, v8
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GFX8-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX8-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-NODL-LABEL: ByteOffsetCorrectness:
+; GFX9-NODL:       ; %bb.0: ; %.entry
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-NODL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NODL-NEXT:    s_cbranch_execz .LBB17_5
+; GFX9-NODL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX9-NODL-NEXT:    v_add_u32_e32 v10, v3, v2
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-NODL-NEXT:    s_movk_i32 s3, 0x900
+; GFX9-NODL-NEXT:    v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-NODL-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-NODL-NEXT:    v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-NODL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-NODL-NEXT:    s_movk_i32 s2, 0x48
+; GFX9-NODL-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-NODL-NEXT:    s_movk_i32 s6, 0xffe1
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT:    s_mov_b32 s7, -1
+; GFX9-NODL-NEXT:    s_mov_b64 s[8:9], 0
+; GFX9-NODL-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX9-NODL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX9-NODL-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NODL-NEXT:    s_mov_b64 s[10:11], 0
+; GFX9-NODL-NEXT:  .LBB17_3: ; %.preheader2
+; GFX9-NODL-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX9-NODL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v12, s11
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-NODL-NEXT:    v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-NODL-NEXT:    v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-NODL-NEXT:    global_load_sbyte v15, v[13:14], off
+; GFX9-NODL-NEXT:    global_load_sbyte v16, v[11:12], off offset:1
+; GFX9-NODL-NEXT:    global_load_sbyte v17, v[11:12], off offset:2
+; GFX9-NODL-NEXT:    global_load_sbyte v18, v[11:12], off offset:3
+; GFX9-NODL-NEXT:    global_load_sbyte v19, v[11:12], off offset:4
+; GFX9-NODL-NEXT:    global_load_sbyte v20, v[11:12], off offset:5
+; GFX9-NODL-NEXT:    global_load_sbyte v21, v[11:12], off offset:6
+; GFX9-NODL-NEXT:    global_load_sbyte v22, v[11:12], off offset:7
+; GFX9-NODL-NEXT:    global_load_sbyte v23, v[9:10], off
+; GFX9-NODL-NEXT:    global_load_sbyte v24, v[9:10], off offset:1
+; GFX9-NODL-NEXT:    global_load_sbyte v25, v[9:10], off offset:2
+; GFX9-NODL-NEXT:    global_load_sbyte v26, v[9:10], off offset:3
+; GFX9-NODL-NEXT:    global_load_sbyte v27, v[9:10], off offset:4
+; GFX9-NODL-NEXT:    global_load_sbyte v28, v[9:10], off offset:5
+; GFX9-NODL-NEXT:    global_load_sbyte v29, v[9:10], off offset:6
+; GFX9-NODL-NEXT:    ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-NODL-NEXT:    global_load_sbyte v11, v[9:10], off offset:7
+; GFX9-NODL-NEXT:    global_load_sbyte v12, v[13:14], off offset:8
+; GFX9-NODL-NEXT:    global_load_sbyte v30, v[9:10], off offset:8
+; GFX9-NODL-NEXT:    s_add_u32 s10, s10, 9
+; GFX9-NODL-NEXT:    s_addc_u32 s11, s11, 0
+; GFX9-NODL-NEXT:    s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v23, v15, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v24, v16, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v25, v17, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v26, v18, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v27, v19, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v28, v20, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v29, v21, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v11, v22, v8
+; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v30, v12, v8
+; GFX9-NODL-NEXT:    s_cbranch_scc1 .LBB17_3
+; GFX9-NODL-NEXT:  ; %bb.4: ; %.110
+; GFX9-NODL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX9-NODL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-NODL-NEXT:    global_store_dword v[9:10], v8, off
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NODL-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
+; GFX9-NODL-NEXT:    v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NODL-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-NODL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NODL-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX9-NODL-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX9-NODL-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: ByteOffsetCorrectness:
+; GFX9-DL:       ; %bb.0: ; %.entry
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-DL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DL-NEXT:    s_cbranch_execz .LBB17_5
+; GFX9-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX9-DL-NEXT:    v_add_u32_e32 v10, v3, v2
+; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-DL-NEXT:    s_movk_i32 s3, 0x900
+; GFX9-DL-NEXT:    v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-DL-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-DL-NEXT:    v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-DL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-DL-NEXT:    s_movk_i32 s2, 0x48
+; GFX9-DL-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-DL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-DL-NEXT:    s_movk_i32 s8, 0xffe1
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-DL-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-DL-NEXT:    s_mov_b32 s12, 0xc0c0400
+; GFX9-DL-NEXT:    s_mov_b32 s9, -1
+; GFX9-DL-NEXT:    s_mov_b32 s13, 0x4000c0c
+; GFX9-DL-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX9-DL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX9-DL-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-DL-NEXT:    s_mov_b64 s[10:11], 0
+; GFX9-DL-NEXT:  .LBB17_3: ; %.preheader2
+; GFX9-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX9-DL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v12, s11
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-DL-NEXT:    v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-DL-NEXT:    v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-DL-NEXT:    v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-DL-NEXT:    v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-DL-NEXT:    global_load_sbyte v15, v[11:12], off offset:1
+; GFX9-DL-NEXT:    global_load_sbyte v16, v[11:12], off offset:2
+; GFX9-DL-NEXT:    global_load_sbyte v17, v[11:12], off offset:3
+; GFX9-DL-NEXT:    global_load_sbyte v18, v[11:12], off offset:4
+; GFX9-DL-NEXT:    global_load_sbyte v19, v[11:12], off offset:5
+; GFX9-DL-NEXT:    global_load_sbyte v20, v[11:12], off offset:6
+; GFX9-DL-NEXT:    global_load_sbyte v21, v[11:12], off offset:7
+; GFX9-DL-NEXT:    global_load_sbyte v22, v[13:14], off
+; GFX9-DL-NEXT:    global_load_sbyte v23, v[13:14], off offset:8
+; GFX9-DL-NEXT:    global_load_sbyte v24, v[9:10], off
+; GFX9-DL-NEXT:    global_load_sbyte v25, v[9:10], off offset:1
+; GFX9-DL-NEXT:    global_load_sbyte v26, v[9:10], off offset:2
+; GFX9-DL-NEXT:    global_load_sbyte v27, v[9:10], off offset:3
+; GFX9-DL-NEXT:    global_load_sbyte v28, v[9:10], off offset:4
+; GFX9-DL-NEXT:    global_load_sbyte v29, v[9:10], off offset:5
+; GFX9-DL-NEXT:    ; kill: killed $vgpr13 killed $vgpr14
+; GFX9-DL-NEXT:    ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-DL-NEXT:    global_load_sbyte v11, v[9:10], off offset:6
+; GFX9-DL-NEXT:    global_load_sbyte v12, v[9:10], off offset:7
+; GFX9-DL-NEXT:    global_load_sbyte v13, v[9:10], off offset:8
+; GFX9-DL-NEXT:    s_add_u32 s10, s10, 9
+; GFX9-DL-NEXT:    s_addc_u32 s11, s11, 0
+; GFX9-DL-NEXT:    s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-DL-NEXT:    v_perm_b32 v9, v16, v15, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-DL-NEXT:    v_perm_b32 v10, v18, v17, s13
+; GFX9-DL-NEXT:    v_or_b32_e32 v9, v10, v9
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-DL-NEXT:    v_perm_b32 v16, v20, v19, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-DL-NEXT:    v_perm_b32 v17, v23, v21, s13
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-DL-NEXT:    v_mad_i32_i24 v8, v24, v22, v8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-DL-NEXT:    v_perm_b32 v14, v26, v25, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-DL-NEXT:    v_perm_b32 v15, v28, v27, s13
+; GFX9-DL-NEXT:    v_or_b32_e32 v10, v15, v14
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v8, v10, v9, v8
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT:    v_perm_b32 v11, v11, v29, s12
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_perm_b32 v12, v13, v12, s13
+; GFX9-DL-NEXT:    v_or_b32_e32 v13, v17, v16
+; GFX9-DL-NEXT:    v_or_b32_e32 v11, v12, v11
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v8, v11, v13, v8
+; GFX9-DL-NEXT:    s_cbranch_scc1 .LBB17_3
+; GFX9-DL-NEXT:  ; %bb.4: ; %.110
+; GFX9-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX9-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v11, s5
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-DL-NEXT:    global_store_dword v[9:10], v8, off
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-DL-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5]
+; GFX9-DL-NEXT:    v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-DL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-DL-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-DL-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX9-DL-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX9-DL-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX9-DL-NEXT:    s_endpgm
+;
+; GFX10-DL-LABEL: ByteOffsetCorrectness:
+; GFX10-DL:       ; %bb.0: ; %.entry
+; GFX10-DL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
+; GFX10-DL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX10-DL-NEXT:    s_cbranch_execz .LBB17_5
+; GFX10-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX10-DL-NEXT:    v_mul_hi_u32_u24_e32 v5, 0x900, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, 0x900, v2
+; GFX10-DL-NEXT:    v_add_nc_u32_e32 v6, v3, v2
+; GFX10-DL-NEXT:    v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX10-DL-NEXT:    s_movk_i32 s2, 0xffe1
+; GFX10-DL-NEXT:    v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5]
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 5, v6
+; GFX10-DL-NEXT:    v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2]
+; GFX10-DL-NEXT:    s_mov_b32 s3, -1
+; GFX10-DL-NEXT:    s_mov_b32 s6, 0
+; GFX10-DL-NEXT:    v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4]
+; GFX10-DL-NEXT:    v_add_co_u32 v0, s0, v8, v0
+; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT:    v_add_co_u32 v2, vcc_lo, s8, v6
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v4, vcc_lo, s8, v4
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v6, vcc_lo, s10, v6
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, 0, s0
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo
+; GFX10-DL-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX10-DL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX10-DL-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-DL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-DL-NEXT:  .LBB17_3: ; %.preheader2
+; GFX10-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX10-DL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX10-DL-NEXT:    v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX10-DL-NEXT:    s_clause 0x6
+; GFX10-DL-NEXT:    global_load_sbyte v15, v[11:12], off offset:1
+; GFX10-DL-NEXT:    global_load_sbyte v16, v[11:12], off offset:2
+; GFX10-DL-NEXT:    global_load_sbyte v17, v[11:12], off offset:3
+; GFX10-DL-NEXT:    global_load_sbyte v18, v[11:12], off offset:4
+; GFX10-DL-NEXT:    global_load_sbyte v19, v[11:12], off offset:5
+; GFX10-DL-NEXT:    global_load_sbyte v20, v[11:12], off offset:6
+; GFX10-DL-NEXT:    global_load_sbyte v21, v[11:12], off offset:7
+; GFX10-DL-NEXT:    s_clause 0x1
+; GFX10-DL-NEXT:    global_load_sbyte v22, v[9:10], off
+; GFX10-DL-NEXT:    global_load_sbyte v23, v[9:10], off offset:8
+; GFX10-DL-NEXT:    s_clause 0x8
+; GFX10-DL-NEXT:    global_load_sbyte v24, v[13:14], off
+; GFX10-DL-NEXT:    global_load_sbyte v25, v[13:14], off offset:1
+; GFX10-DL-NEXT:    global_load_sbyte v26, v[13:14], off offset:2
+; GFX10-DL-NEXT:    global_load_sbyte v27, v[13:14], off offset:3
+; GFX10-DL-NEXT:    global_load_sbyte v28, v[13:14], off offset:4
+; GFX10-DL-NEXT:    global_load_sbyte v29, v[13:14], off offset:5
+; GFX10-DL-NEXT:    ; meta instruction
+; GFX10-DL-NEXT:    ; meta instruction
+; GFX10-DL-NEXT:    global_load_sbyte v9, v[13:14], off offset:6
+; GFX10-DL-NEXT:    global_load_sbyte v10, v[13:14], off offset:7
+; GFX10-DL-NEXT:    global_load_sbyte v11, v[13:14], off offset:8
+; GFX10-DL-NEXT:    s_add_u32 s0, s0, 9
+; GFX10-DL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-DL-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(16)
+; GFX10-DL-NEXT:    v_perm_b32 v12, v16, v15, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(14)
+; GFX10-DL-NEXT:    v_perm_b32 v13, v18, v17, 0x4000c0c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(12)
+; GFX10-DL-NEXT:    v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-DL-NEXT:    v_perm_b32 v17, v23, v21, 0x4000c0c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-DL-NEXT:    v_mad_i32_i24 v8, v24, v22, v8
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-DL-NEXT:    v_perm_b32 v14, v26, v25, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-DL-NEXT:    v_perm_b32 v15, v28, v27, 0x4000c0c
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT:    v_perm_b32 v9, v9, v29, 0xc0c0400
+; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT:    v_perm_b32 v10, v11, v10, 0x4000c0c
+; GFX10-DL-NEXT:    v_or_b32_e32 v11, v13, v12
+; GFX10-DL-NEXT:    v_or_b32_e32 v12, v15, v14
+; GFX10-DL-NEXT:    v_or_b32_e32 v13, v17, v16
+; GFX10-DL-NEXT:    v_or_b32_e32 v9, v10, v9
+; GFX10-DL-NEXT:    v_dot4c_i32_i8 v8, v12, v11
+; GFX10-DL-NEXT:    v_dot4c_i32_i8 v8, v9, v13
+; GFX10-DL-NEXT:    s_cbranch_scc1 .LBB17_3
+; GFX10-DL-NEXT:  ; %bb.4: ; %.110
+; GFX10-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX10-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX10-DL-NEXT:    v_add_co_u32 v6, s0, 0x900, v6
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-DL-NEXT:    v_add_co_u32 v9, vcc_lo, s4, v9
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-DL-NEXT:    v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX10-DL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, v11
+; GFX10-DL-NEXT:    v_mov_b32_e32 v1, v12
+; GFX10-DL-NEXT:    global_store_dword v[9:10], v8, off
+; GFX10-DL-NEXT:    s_or_b32 s6, vcc_lo, s6
+; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-DL-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX10-DL-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: ByteOffsetCorrectness:
+; GFX11-DL:       ; %bb.0: ; %.entry
+; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-DL-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_cmpx_gt_i64_e32 2, v[1:2]
+; GFX11-DL-NEXT:    s_cbranch_execz .LBB17_5
+; GFX11-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 20, 10
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-DL-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; GFX11-DL-NEXT:    v_mul_hi_u32_u24_e32 v4, 0x48, v1
+; GFX11-DL-NEXT:    s_load_b64 s[2:3], s[2:3], 0x34
+; GFX11-DL-NEXT:    v_mul_hi_u32_u24_e32 v3, 0x900, v5
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, 0x900, v5
+; GFX11-DL-NEXT:    v_add_nc_u32_e32 v9, v0, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3]
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v3, 0x48, v1
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 5, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4]
+; GFX11-DL-NEXT:    v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6]
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_co_u32 v0, s0, v0, v1
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_add_co_u32 v2, vcc_lo, s4, v7
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v4, vcc_lo, s4, v9
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v6, vcc_lo, s6, v7
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo
+; GFX11-DL-NEXT:    s_movk_i32 s4, 0xffe1
+; GFX11-DL-NEXT:    s_mov_b32 s5, -1
+; GFX11-DL-NEXT:    s_mov_b32 s6, 0
+; GFX11-DL-NEXT:  .LBB17_2: ; %.lr.ph
+; GFX11-DL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX11-DL-NEXT:    ; Child Loop BB17_3 Depth 2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-DL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-DL-NEXT:  .LBB17_3: ; %.preheader2
+; GFX11-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
+; GFX11-DL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT:    v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX11-DL-NEXT:    s_clause 0x6
+; GFX11-DL-NEXT:    global_load_i8 v15, v[11:12], off offset:1
+; GFX11-DL-NEXT:    global_load_i8 v16, v[11:12], off offset:2
+; GFX11-DL-NEXT:    global_load_i8 v17, v[11:12], off offset:3
+; GFX11-DL-NEXT:    global_load_i8 v18, v[11:12], off offset:4
+; GFX11-DL-NEXT:    global_load_i8 v19, v[11:12], off offset:5
+; GFX11-DL-NEXT:    global_load_i8 v20, v[11:12], off offset:6
+; GFX11-DL-NEXT:    global_load_i8 v11, v[11:12], off offset:7
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_i8 v12, v[9:10], off
+; GFX11-DL-NEXT:    global_load_i8 v9, v[9:10], off offset:8
+; GFX11-DL-NEXT:    s_clause 0x8
+; GFX11-DL-NEXT:    global_load_i8 v10, v[13:14], off
+; GFX11-DL-NEXT:    global_load_i8 v21, v[13:14], off offset:1
+; GFX11-DL-NEXT:    global_load_i8 v22, v[13:14], off offset:2
+; GFX11-DL-NEXT:    global_load_i8 v23, v[13:14], off offset:3
+; GFX11-DL-NEXT:    global_load_i8 v24, v[13:14], off offset:4
+; GFX11-DL-NEXT:    global_load_i8 v25, v[13:14], off offset:5
+; GFX11-DL-NEXT:    global_load_i8 v26, v[13:14], off offset:6
+; GFX11-DL-NEXT:    global_load_i8 v27, v[13:14], off offset:7
+; GFX11-DL-NEXT:    global_load_i8 v13, v[13:14], off offset:8
+; GFX11-DL-NEXT:    s_add_u32 s0, s0, 9
+; GFX11-DL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-DL-NEXT:    v_perm_b32 v9, v9, v11, 0x4000c0c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v8, v10, v12, v8
+; GFX11-DL-NEXT:    v_perm_b32 v10, v16, v15, 0xc0c0400
+; GFX11-DL-NEXT:    v_perm_b32 v12, v18, v17, 0x4000c0c
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-DL-NEXT:    v_perm_b32 v14, v22, v21, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-DL-NEXT:    v_perm_b32 v15, v24, v23, 0x4000c0c
+; GFX11-DL-NEXT:    v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_perm_b32 v11, v26, v25, 0xc0c0400
+; GFX11-DL-NEXT:    v_or_b32_e32 v10, v12, v10
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_perm_b32 v13, v13, v27, 0x4000c0c
+; GFX11-DL-NEXT:    v_or_b32_e32 v12, v15, v14
+; GFX11-DL-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_or_b32_e32 v11, v13, v11
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    s_cbranch_scc1 .LBB17_3
+; GFX11-DL-NEXT:  ; %bb.4: ; %.110
+; GFX11-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
+; GFX11-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX11-DL-NEXT:    v_add_co_u32 v6, s0, 0x900, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX11-DL-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-DL-NEXT:    v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-DL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-DL-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-DL-NEXT:    global_store_b32 v[9:10], v8, off
+; GFX11-DL-NEXT:    s_or_b32 s6, vcc_lo, s6
+; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-DL-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX11-DL-NEXT:  .LBB17_5: ; %._crit_edge
+; GFX11-DL-NEXT:    s_endpgm
+.entry:
+  %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sworkitemx = sext i32 %workitemx to i64
+  %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %sworkitemy = sext i32 %workitemy to i64
+  %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
+  %sworkitemz = sext i32 %workitemz to i64
+  %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz
+  %ivtemp1 = shl nsw i64 %ivtemp0, 5
+  %iv = add nsw i64 %ivtemp1, %sworkitemx
+  %cmp = icmp slt i64 %sworkitemx, 2
+  br i1 %cmp, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %.entry, %.110
+  %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ]
+  %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi
+  %scalarmul = mul nsw i64 %phi, 72
+  br label %.preheader2
+
+.preheader2:                                      ; preds = %.lr.ph, %.preheader2
+  %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ]
+  %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ]
+  %mul0 = mul nuw nsw i64 %phi1, 9
+  %scalaradd = add nsw i64 %mul0, %scalarmul
+  %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd
+  %l10 = load i8, ptr addrspace(1) %gep10, align 1
+  %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd
+  %l11 = load i8, ptr addrspace(1) %gep11, align 1
+  %op11 = sext i8 %l10 to i32
+  %op10 = sext i8 %l11 to i32
+  %mul1 = mul nsw i32 %op10, %op11
+  %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67
+  %off2 = add nsw i64 %scalaradd, 1
+  %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2
+  %l21 = load i8, ptr addrspace(1) %gep21, align 1
+  %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2
+  %l20 = load i8, ptr addrspace(1) %gep20, align 1
+  %op21 = sext i8 %l21 to i32
+  %op20 = sext i8 %l20 to i32
+  %mul2 = mul nsw i32 %op20, %op21
+  %ivadd2 = add i32 %mul2, %ivadd1
+  %off3 = add nsw i64 %scalaradd, 2
+  %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3
+  %l31 = load i8, ptr addrspace(1) %gep31, align 1
+  %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3
+  %l30 = load i8, ptr addrspace(1) %gep30, align 1
+  %op31 = sext i8 %l31 to i32
+  %op30 = sext i8 %l30 to i32
+  %mul3 = mul nsw i32 %op30, %op31
+  %ivadd3 = add i32 %mul3, %ivadd2
+  %off4 = add nsw i64 %scalaradd, 3
+  %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4
+  %l41 = load i8, ptr addrspace(1) %gep41, align 1
+  %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4
+  %l40 = load i8, ptr addrspace(1) %gep40, align 1
+  %op41 = sext i8 %l41 to i32
+  %op40 = sext i8 %l40 to i32
+  %mul4 = mul nsw i32 %op40, %op41
+  %ivadd4 = add i32 %mul4, %ivadd3
+  %off5 = add nsw i64 %scalaradd, 4
+  %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5
+  %l51 = load i8, ptr addrspace(1) %gep51, align 1
+  %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5
+  %l50 = load i8, ptr addrspace(1) %gep50, align 1
+  %op51 = sext i8 %l51 to i32
+  %op50 = sext i8 %l50 to i32
+  %mul5 = mul nsw i32 %op50, %op51
+  %ivadd5 = add i32 %mul5, %ivadd4
+  %off6 = add nsw i64 %scalaradd, 5
+  %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6
+  %l61 = load i8, ptr addrspace(1) %gep61, align 1
+  %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6
+  %l60 = load i8, ptr addrspace(1) %gep60, align 1
+  %op61 = sext i8 %l61 to i32
+  %op60 = sext i8 %l60 to i32
+  %mul6 = mul nsw i32 %op60, %op61
+  %ivadd6 = add i32 %mul6, %ivadd5
+  %off7 = add nsw i64 %scalaradd, 6
+  %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7
+  %l71 = load i8, ptr addrspace(1) %gep71, align 1
+  %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7
+  %l70 = load i8, ptr addrspace(1) %gep70, align 1
+  %op71 = sext i8 %l71 to i32
+  %op70 = sext i8 %l70 to i32
+  %mul7 = mul nsw i32 %op70, %op71
+  %ivadd7 = add i32 %mul7, %ivadd6
+  %off8 = add nsw i64 %scalaradd, 7
+  %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8
+  %l81 = load i8, ptr addrspace(1) %gep81, align 1
+  %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8
+  %l80 = load i8, ptr addrspace(1) %gep80, align 1
+  %op81 = sext i8 %l81 to i32
+  %op80 = sext i8 %l80 to i32
+  %mul8 = mul nsw i32 %op80, %op81
+  %ivadd8 = add i32 %mul8, %ivadd7
+  %off9 = add nsw i64 %scalaradd, 8
+  %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9
+  %l91 = load i8, ptr addrspace(1) %gep91, align 1
+  %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9
+  %l90 = load i8, ptr addrspace(1) %gep90, align 1
+  %op91 = sext i8 %l91 to i32
+  %op90 = sext i8 %l90 to i32
+  %mul9 = mul nsw i32 %op90, %op91
+  %ivadd9 = add i32 %mul9, %ivadd8
+  %limit = add nuw nsw i64 %phi1, 1
+  %exitcond.not = icmp eq i64 %limit, 8
+  br i1 %exitcond.not, label %.110, label %.preheader2
+
+.110:                                              ; preds = %.preheader2
+  store i32 %ivadd9, ptr addrspace(1) %outptr, align 4
+  %outerlimit = add nsw i64 %phi, 32
+  %outerexitcond = icmp slt i64 %phi, -30
+  br i1 %outerexitcond, label %.lr.ph, label %._crit_edge
+
+._crit_edge:                                      ; preds = %.110, %.entry
+  ret void
+}
+
+
 declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()

>From 164e1a51f4d75722b328b8bc4c6ee20bf1be33e7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:32:31 -0800
Subject: [PATCH 2/5] Fix test

Change-Id: Ifa2ee3caaf13bc563119f79a241c3231557d401f
---
 llvm/test/CodeGen/AMDGPU/idot4-combine.ll | 116 ++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/idot4-combine.ll

diff --git a/llvm/test/CodeGen/AMDGPU/idot4-combine.ll b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
new file mode 100644
index 00000000000000..18920fd4e40a24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/idot4-combine.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 --start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+; The first (A) operand of the v_dot4 is derived from the LHS of the mul chain (that is, %l6080, %l7081, %l8082, %l9083).
+; These are elements 5, 6, 7 and 8 (zero-based) of the load %7, i.e. bytes 5-8 of that load.
+; Confirm that we are actually accessing these bytes.
+;
+; Previously, we used the dword offset from the corresponding byte in the second (B) operand.
+; The result was to access byte 3 of %7 instead of byte 7 (i.e. a dword offset of 0 instead of 1).
+
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2, ptr addrspace(1) %outptr) local_unnamed_addr #0 {
+; GFX11-LABEL: ByteOffsetCorrectness:
+; GFX11:       ; %bb.0: ; %.entry
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 20, 10
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x3c
+; GFX11-NEXT:    v_bfe_u32 v6, v0, 10, 10
+; GFX11-NEXT:    v_and_b32_e32 v7, 0x3ff, v0
+; GFX11-NEXT:    v_mul_hi_u32_u24_e32 v1, 0x900, v2
+; GFX11-NEXT:    v_mul_u32_u24_e32 v0, 0x900, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, v6, v2
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0x900, v6, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-NEXT:    v_mul_hi_u32_u24_e32 v3, 0x48, v7
+; GFX11-NEXT:    v_mul_u32_u24_e32 v2, 0x48, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, 0x900, v8, v[2:3]
+; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x48, v7, v[4:5]
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, s4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, s6, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_load_i8 v7, v[4:5], off offset:7
+; GFX11-NEXT:    global_load_i8 v2, v[2:3], off offset:8
+; GFX11-NEXT:    global_load_d16_b16 v6, v[4:5], off offset:5
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_i8 v3, v[0:1], off offset:8
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v7, 0x4000c0c
+; GFX11-NEXT:    v_perm_b32 v2, v6, v6, 0xc0c0100
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x4030201
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
+; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
+.entry:
+  %ByteOffsetCorrectness.kernarg.segment = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; result is not referenced again in this function
+  %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sworkitemx = sext i32 %workitemx to i64
+  %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %sworkitemy = sext i32 %workitemy to i64
+  %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
+  %sworkitemz = sext i32 %workitemz to i64
+  %ivtemp0 = add i64 %sworkitemy, %sworkitemz
+  %ivtemp1 = shl nsw i64 %ivtemp0, 5
+  %iv = add nsw i64 %ivtemp1, %sworkitemx ; %iv is not referenced again in this function
+  %0 = mul nsw i64 %ivtemp0, 2304
+  %1 = mul nsw i64 %sworkitemx, 72
+  %2 = add i64 %0, %1 ; %2 = (y + z) * 2304 + x * 72
+  %scevgep = getelementptr i8, ptr addrspace(1) %inptr0, i64 %2
+  %3 = mul nsw i64 %sworkitemy, 2304
+  %4 = mul nsw i64 %sworkitemz, 2304
+  %5 = add i64 %3, %4
+  %6 = add i64 %5, %1 ; %6 = y * 2304 + z * 2304 + x * 72
+  %scevgep49 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %6
+  %scevgep55 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %2
+  %scevgep54 = getelementptr i8, ptr addrspace(1) %scevgep49, i64 0
+  %l10 = load i8, ptr addrspace(1) %scevgep54, align 1 ; %l10 is not referenced again in this function
+  %scevgep58 = getelementptr i8, ptr addrspace(1) %scevgep55, i64 0
+  %7 = load <9 x i8>, ptr addrspace(1) %scevgep58, align 1 ; nine bytes starting at %inptr1 + %2
+  %l6080 = extractelement <9 x i8> %7, i32 5 ; element 5 (zero-based) of %7
+  %l7081 = extractelement <9 x i8> %7, i32 6 ; element 6 of %7
+  %l8082 = extractelement <9 x i8> %7, i32 7 ; element 7 of %7
+  %l9083 = extractelement <9 x i8> %7, i32 8 ; element 8 of %7
+  %scevgep35 = getelementptr i8, ptr addrspace(1) %scevgep, i64 0
+  %scevgep36 = getelementptr i8, ptr addrspace(1) %scevgep35, i64 1
+  %8 = load <7 x i8>, ptr addrspace(1) %scevgep36, align 1 ; seven bytes starting at %inptr0 + %2 + 1
+  %l6188 = extractelement <7 x i8> %8, i32 4 ; element 4 of %8, i.e. byte offset 5 from %scevgep
+  %l7189 = extractelement <7 x i8> %8, i32 5 ; element 5 of %8, i.e. byte offset 6 from %scevgep
+  %l8190 = extractelement <7 x i8> %8, i32 6 ; element 6 of %8, i.e. byte offset 7 from %scevgep
+  %op61 = sext i8 %l6188 to i32
+  %op60 = sext i8 %l6080 to i32
+  %mul6 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op60, i32 %op61) ; (%7 element 5) * (%8 element 4)
+  %ivadd6 = add i32 %mul6, 0 ; the accumulation chain starts from 0
+  %op71 = sext i8 %l7189 to i32
+  %op70 = sext i8 %l7081 to i32
+  %mul7 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op70, i32 %op71) ; (%7 element 6) * (%8 element 5)
+  %ivadd7 = add i32 %mul7, %ivadd6
+  %op81 = sext i8 %l8190 to i32
+  %op80 = sext i8 %l8082 to i32
+  %mul8 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op80, i32 %op81) ; (%7 element 7) * (%8 element 6)
+  %ivadd8 = add i32 %mul8, %ivadd7
+  %scevgep53 = getelementptr i8, ptr addrspace(1) %scevgep54, i64 8
+  %l91 = load i8, ptr addrspace(1) %scevgep53, align 1 ; single byte at offset 8 from %scevgep49
+  %op91 = sext i8 %l91 to i32
+  %op90 = sext i8 %l9083 to i32
+  %mul9 = call i32 @llvm.amdgcn.mul.i24.i32(i32 %op90, i32 %op91) ; (%7 element 8) * %l91
+  %ivadd9 = add i32 %mul9, %ivadd8
+  store i32 %ivadd9, ptr addrspace(1) %outptr, align 4 ; sum of the four products
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()

>From 7d81864d9f68083be83ab8faf3497a88e6fc1763 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:36:41 -0800
Subject: [PATCH 3/5] Remove old test

Change-Id: I70cc33b3e3af22d276ede907d3cbf9a2132f6ce4
---
 llvm/test/CodeGen/AMDGPU/idot4s.ll | 848 +----------------------------
 1 file changed, 1 insertion(+), 847 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 15734094db42cd..17182b20bfba7d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3449,851 +3449,5 @@ entry:
   ret void
 }
 
-
-define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
-; GFX7-LABEL: ByteOffsetCorrectness:
-; GFX7:       ; %bb.0: ; %.entry
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX7-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7-NEXT:    s_cbranch_execz .LBB17_5
-; GFX7-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v2
-; GFX7-NEXT:    v_mul_hi_u32_u24_e32 v2, 0x48, v0
-; GFX7-NEXT:    v_mul_u32_u24_e32 v1, 0x48, v0
-; GFX7-NEXT:    s_movk_i32 s0, 0x900
-; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
-; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 5, v3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v6, v0
-; GFX7-NEXT:    v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, s11
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s10, v4
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v6, s9
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, s8, v4
-; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v6, 0x48
-; GFX7-NEXT:    s_movk_i32 s10, 0xffe1
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX7-NEXT:    s_mov_b32 s11, -1
-; GFX7-NEXT:    s_mov_b64 s[12:13], 0
-; GFX7-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX7-NEXT:    v_mov_b32_e32 v8, 0
-; GFX7-NEXT:    s_mov_b64 s[0:1], s[8:9]
-; GFX7-NEXT:  .LBB17_3: ; %.preheader2
-; GFX7-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT:    buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
-; GFX7-NEXT:    buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
-; GFX7-NEXT:    buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
-; GFX7-NEXT:    buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
-; GFX7-NEXT:    buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
-; GFX7-NEXT:    buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
-; GFX7-NEXT:    buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
-; GFX7-NEXT:    buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
-; GFX7-NEXT:    buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
-; GFX7-NEXT:    buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
-; GFX7-NEXT:    buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
-; GFX7-NEXT:    buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
-; GFX7-NEXT:    buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
-; GFX7-NEXT:    buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
-; GFX7-NEXT:    buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
-; GFX7-NEXT:    buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
-; GFX7-NEXT:    buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
-; GFX7-NEXT:    buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
-; GFX7-NEXT:    s_add_u32 s0, s0, 9
-; GFX7-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
-; GFX7-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v18, v9, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v19, v10, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v20, v11, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v21, v12, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v22, v13, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v23, v14, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v24, v15, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v25, v16, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v8, v26, v17, v8
-; GFX7-NEXT:    s_cbranch_vccnz .LBB17_3
-; GFX7-NEXT:  ; %bb.4: ; %.110
-; GFX7-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX7-NEXT:    v_lshl_b64 v[9:10], v[2:3], 2
-; GFX7-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
-; GFX7-NEXT:    buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v2
-; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x900, v4
-; GFX7-NEXT:    v_mov_b32_e32 v2, v8
-; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX7-NEXT:    s_or_b64 s[12:13], s[0:1], s[12:13]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v9
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[12:13]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX7-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX7-NEXT:    s_endpgm
-;
-; GFX8-LABEL: ByteOffsetCorrectness:
-; GFX8:       ; %bb.0: ; %.entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB17_5
-; GFX8-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0x900
-; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v4, 0x900, v3
-; GFX8-NEXT:    v_mul_u32_u24_e32 v3, 0x900, v3
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
-; GFX8-NEXT:    s_movk_i32 s0, 0x48
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 5, v5
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s6, v2
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v4, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s4, v2
-; GFX8-NEXT:    s_movk_i32 s4, 0xffe1
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v4, v3, vcc
-; GFX8-NEXT:    s_mov_b32 s5, -1
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX8-NEXT:    v_mov_b32_e32 v10, 0
-; GFX8-NEXT:    s_mov_b64 s[0:1], 0
-; GFX8-NEXT:  .LBB17_3: ; %.preheader2
-; GFX8-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v8
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v9, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v6
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v7, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v11, v[4:5]
-; GFX8-NEXT:    flat_load_sbyte v12, v[2:3]
-; GFX8-NEXT:    s_add_u32 s0, s0, 9
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v12, v11, v10
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 2, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 2, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 3, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 3, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 4, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 4, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 5, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 5, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v12, v10, v13, v12
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 7, v4
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
-; GFX8-NEXT:    flat_load_sbyte v13, v[10:11]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 7, v2
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
-; GFX8-NEXT:    flat_load_sbyte v10, v[10:11]
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_load_sbyte v4, v[4:5]
-; GFX8-NEXT:    flat_load_sbyte v2, v[2:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_mad_i32_i24 v10, v10, v13, v12
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v10, v2, v4, v10
-; GFX8-NEXT:    s_cbranch_scc1 .LBB17_3
-; GFX8-NEXT:  ; %bb.4: ; %.110
-; GFX8-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GFX8-NEXT:    flat_store_dword v[2:3], v10
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x900, v6
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x900, v8
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GFX8-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v1, v3
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX8-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX8-NEXT:    s_endpgm
-;
-; GFX9-NODL-LABEL: ByteOffsetCorrectness:
-; GFX9-NODL:       ; %bb.0: ; %.entry
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NODL-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX9-NODL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX9-NODL-NEXT:    s_cbranch_execz .LBB17_5
-; GFX9-NODL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX9-NODL-NEXT:    v_add_u32_e32 v10, v3, v2
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v1, 5, v10
-; GFX9-NODL-NEXT:    s_movk_i32 s3, 0x900
-; GFX9-NODL-NEXT:    v_mul_hi_u32_u24_e32 v9, 0x900, v2
-; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v8, 0x900, v2
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v4, vcc, v1, v0
-; GFX9-NODL-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
-; GFX9-NODL-NEXT:    v_mul_hi_u32_u24_e32 v7, 0x48, v0
-; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v6, 0x48, v0
-; GFX9-NODL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
-; GFX9-NODL-NEXT:    s_movk_i32 s2, 0x48
-; GFX9-NODL-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
-; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
-; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v6
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v6, vcc, s10, v6
-; GFX9-NODL-NEXT:    s_movk_i32 s6, 0xffe1
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
-; GFX9-NODL-NEXT:    s_mov_b32 s7, -1
-; GFX9-NODL-NEXT:    s_mov_b64 s[8:9], 0
-; GFX9-NODL-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX9-NODL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX9-NODL-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NODL-NEXT:    s_mov_b64 s[10:11], 0
-; GFX9-NODL-NEXT:  .LBB17_3: ; %.preheader2
-; GFX9-NODL-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX9-NODL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v12, s11
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v9, vcc, s10, v6
-; GFX9-NODL-NEXT:    v_add_co_u32_e64 v11, s[0:1], s10, v0
-; GFX9-NODL-NEXT:    v_add_co_u32_e64 v13, s[2:3], s10, v2
-; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
-; GFX9-NODL-NEXT:    v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
-; GFX9-NODL-NEXT:    global_load_sbyte v15, v[13:14], off
-; GFX9-NODL-NEXT:    global_load_sbyte v16, v[11:12], off offset:1
-; GFX9-NODL-NEXT:    global_load_sbyte v17, v[11:12], off offset:2
-; GFX9-NODL-NEXT:    global_load_sbyte v18, v[11:12], off offset:3
-; GFX9-NODL-NEXT:    global_load_sbyte v19, v[11:12], off offset:4
-; GFX9-NODL-NEXT:    global_load_sbyte v20, v[11:12], off offset:5
-; GFX9-NODL-NEXT:    global_load_sbyte v21, v[11:12], off offset:6
-; GFX9-NODL-NEXT:    global_load_sbyte v22, v[11:12], off offset:7
-; GFX9-NODL-NEXT:    global_load_sbyte v23, v[9:10], off
-; GFX9-NODL-NEXT:    global_load_sbyte v24, v[9:10], off offset:1
-; GFX9-NODL-NEXT:    global_load_sbyte v25, v[9:10], off offset:2
-; GFX9-NODL-NEXT:    global_load_sbyte v26, v[9:10], off offset:3
-; GFX9-NODL-NEXT:    global_load_sbyte v27, v[9:10], off offset:4
-; GFX9-NODL-NEXT:    global_load_sbyte v28, v[9:10], off offset:5
-; GFX9-NODL-NEXT:    global_load_sbyte v29, v[9:10], off offset:6
-; GFX9-NODL-NEXT:    ; kill: killed $vgpr11 killed $vgpr12
-; GFX9-NODL-NEXT:    global_load_sbyte v11, v[9:10], off offset:7
-; GFX9-NODL-NEXT:    global_load_sbyte v12, v[13:14], off offset:8
-; GFX9-NODL-NEXT:    global_load_sbyte v30, v[9:10], off offset:8
-; GFX9-NODL-NEXT:    s_add_u32 s10, s10, 9
-; GFX9-NODL-NEXT:    s_addc_u32 s11, s11, 0
-; GFX9-NODL-NEXT:    s_cmp_lg_u64 s[10:11], 0x48
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v23, v15, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v24, v16, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v25, v17, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v26, v18, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v27, v19, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v28, v20, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v29, v21, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v11, v22, v8
-; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_i32_i24 v8, v30, v12, v8
-; GFX9-NODL-NEXT:    s_cbranch_scc1 .LBB17_3
-; GFX9-NODL-NEXT:  ; %bb.4: ; %.110
-; GFX9-NODL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX9-NODL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[4:5]
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v11, s5
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v9, vcc, s4, v9
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
-; GFX9-NODL-NEXT:    global_store_dword v[9:10], v8, off
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v8, vcc, 32, v4
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x900, v0
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v2, vcc, 0x900, v2
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NODL-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
-; GFX9-NODL-NEXT:    v_add_co_u32_e32 v6, vcc, 0x900, v6
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, v8
-; GFX9-NODL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NODL-NEXT:    s_or_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-NODL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX9-NODL-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX9-NODL-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX9-NODL-NEXT:    s_endpgm
-;
-; GFX9-DL-LABEL: ByteOffsetCorrectness:
-; GFX9-DL:       ; %bb.0: ; %.entry
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DL-NEXT:    v_cmp_gt_i64_e32 vcc, 2, v[0:1]
-; GFX9-DL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX9-DL-NEXT:    s_cbranch_execz .LBB17_5
-; GFX9-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX9-DL-NEXT:    v_add_u32_e32 v10, v3, v2
-; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v1, 5, v10
-; GFX9-DL-NEXT:    s_movk_i32 s3, 0x900
-; GFX9-DL-NEXT:    v_mul_hi_u32_u24_e32 v9, 0x900, v2
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, 0x900, v2
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v4, vcc, v1, v0
-; GFX9-DL-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
-; GFX9-DL-NEXT:    v_mul_hi_u32_u24_e32 v7, 0x48, v0
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, 0x48, v0
-; GFX9-DL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
-; GFX9-DL-NEXT:    s_movk_i32 s2, 0x48
-; GFX9-DL-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
-; GFX9-DL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
-; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v0, vcc, s8, v6
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v6, vcc, s10, v6
-; GFX9-DL-NEXT:    s_movk_i32 s8, 0xffe1
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
-; GFX9-DL-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-DL-NEXT:    s_mov_b32 s12, 0xc0c0400
-; GFX9-DL-NEXT:    s_mov_b32 s9, -1
-; GFX9-DL-NEXT:    s_mov_b32 s13, 0x4000c0c
-; GFX9-DL-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX9-DL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX9-DL-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DL-NEXT:    s_mov_b64 s[10:11], 0
-; GFX9-DL-NEXT:  .LBB17_3: ; %.preheader2
-; GFX9-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX9-DL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v12, s11
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v9, vcc, s10, v6
-; GFX9-DL-NEXT:    v_add_co_u32_e64 v11, s[0:1], s10, v0
-; GFX9-DL-NEXT:    v_add_co_u32_e64 v13, s[2:3], s10, v2
-; GFX9-DL-NEXT:    v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
-; GFX9-DL-NEXT:    v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
-; GFX9-DL-NEXT:    global_load_sbyte v15, v[11:12], off offset:1
-; GFX9-DL-NEXT:    global_load_sbyte v16, v[11:12], off offset:2
-; GFX9-DL-NEXT:    global_load_sbyte v17, v[11:12], off offset:3
-; GFX9-DL-NEXT:    global_load_sbyte v18, v[11:12], off offset:4
-; GFX9-DL-NEXT:    global_load_sbyte v19, v[11:12], off offset:5
-; GFX9-DL-NEXT:    global_load_sbyte v20, v[11:12], off offset:6
-; GFX9-DL-NEXT:    global_load_sbyte v21, v[11:12], off offset:7
-; GFX9-DL-NEXT:    global_load_sbyte v22, v[13:14], off
-; GFX9-DL-NEXT:    global_load_sbyte v23, v[13:14], off offset:8
-; GFX9-DL-NEXT:    global_load_sbyte v24, v[9:10], off
-; GFX9-DL-NEXT:    global_load_sbyte v25, v[9:10], off offset:1
-; GFX9-DL-NEXT:    global_load_sbyte v26, v[9:10], off offset:2
-; GFX9-DL-NEXT:    global_load_sbyte v27, v[9:10], off offset:3
-; GFX9-DL-NEXT:    global_load_sbyte v28, v[9:10], off offset:4
-; GFX9-DL-NEXT:    global_load_sbyte v29, v[9:10], off offset:5
-; GFX9-DL-NEXT:    ; kill: killed $vgpr13 killed $vgpr14
-; GFX9-DL-NEXT:    ; kill: killed $vgpr11 killed $vgpr12
-; GFX9-DL-NEXT:    global_load_sbyte v11, v[9:10], off offset:6
-; GFX9-DL-NEXT:    global_load_sbyte v12, v[9:10], off offset:7
-; GFX9-DL-NEXT:    global_load_sbyte v13, v[9:10], off offset:8
-; GFX9-DL-NEXT:    s_add_u32 s10, s10, 9
-; GFX9-DL-NEXT:    s_addc_u32 s11, s11, 0
-; GFX9-DL-NEXT:    s_cmp_lg_u64 s[10:11], 0x48
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(16)
-; GFX9-DL-NEXT:    v_perm_b32 v9, v16, v15, s12
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(14)
-; GFX9-DL-NEXT:    v_perm_b32 v10, v18, v17, s13
-; GFX9-DL-NEXT:    v_or_b32_e32 v9, v10, v9
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(12)
-; GFX9-DL-NEXT:    v_perm_b32 v16, v20, v19, s12
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-DL-NEXT:    v_perm_b32 v17, v23, v21, s13
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v8, v24, v22, v8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-DL-NEXT:    v_perm_b32 v14, v26, v25, s12
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-DL-NEXT:    v_perm_b32 v15, v28, v27, s13
-; GFX9-DL-NEXT:    v_or_b32_e32 v10, v15, v14
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v8, v10, v9, v8
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_perm_b32 v11, v11, v29, s12
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_perm_b32 v12, v13, v12, s13
-; GFX9-DL-NEXT:    v_or_b32_e32 v13, v17, v16
-; GFX9-DL-NEXT:    v_or_b32_e32 v11, v12, v11
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v8, v11, v13, v8
-; GFX9-DL-NEXT:    s_cbranch_scc1 .LBB17_3
-; GFX9-DL-NEXT:  ; %bb.4: ; %.110
-; GFX9-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX9-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[4:5]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v11, s5
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v9, vcc, s4, v9
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
-; GFX9-DL-NEXT:    global_store_dword v[9:10], v8, off
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v8, vcc, 32, v4
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x900, v0
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v2, vcc, 0x900, v2
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-DL-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5]
-; GFX9-DL-NEXT:    v_add_co_u32_e32 v6, vcc, 0x900, v6
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, v8
-; GFX9-DL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-DL-NEXT:    s_or_b64 s[6:7], s[0:1], s[6:7]
-; GFX9-DL-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-DL-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX9-DL-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX9-DL-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX9-DL-NEXT:    s_endpgm
-;
-; GFX10-DL-LABEL: ByteOffsetCorrectness:
-; GFX10-DL:       ; %bb.0: ; %.entry
-; GFX10-DL-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
-; GFX10-DL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX10-DL-NEXT:    s_cbranch_execz .LBB17_5
-; GFX10-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
-; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GFX10-DL-NEXT:    v_mul_hi_u32_u24_e32 v5, 0x900, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, 0x900, v2
-; GFX10-DL-NEXT:    v_add_nc_u32_e32 v6, v3, v2
-; GFX10-DL-NEXT:    v_mul_hi_u32_u24_e32 v2, 0x48, v0
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, 0x48, v0
-; GFX10-DL-NEXT:    s_movk_i32 s2, 0xffe1
-; GFX10-DL-NEXT:    v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5]
-; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 5, v6
-; GFX10-DL-NEXT:    v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2]
-; GFX10-DL-NEXT:    s_mov_b32 s3, -1
-; GFX10-DL-NEXT:    s_mov_b32 s6, 0
-; GFX10-DL-NEXT:    v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4]
-; GFX10-DL-NEXT:    v_add_co_u32 v0, s0, v8, v0
-; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_add_co_u32 v2, vcc_lo, s8, v6
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v4, vcc_lo, s8, v4
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v6, vcc_lo, s10, v6
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, 0, s0
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo
-; GFX10-DL-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX10-DL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-DL-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-DL-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-DL-NEXT:  .LBB17_3: ; %.preheader2
-; GFX10-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX10-DL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX10-DL-NEXT:    v_add_co_u32 v9, vcc_lo, v4, s0
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v2, s0
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v13, vcc_lo, v6, s0
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
-; GFX10-DL-NEXT:    s_clause 0x6
-; GFX10-DL-NEXT:    global_load_sbyte v15, v[11:12], off offset:1
-; GFX10-DL-NEXT:    global_load_sbyte v16, v[11:12], off offset:2
-; GFX10-DL-NEXT:    global_load_sbyte v17, v[11:12], off offset:3
-; GFX10-DL-NEXT:    global_load_sbyte v18, v[11:12], off offset:4
-; GFX10-DL-NEXT:    global_load_sbyte v19, v[11:12], off offset:5
-; GFX10-DL-NEXT:    global_load_sbyte v20, v[11:12], off offset:6
-; GFX10-DL-NEXT:    global_load_sbyte v21, v[11:12], off offset:7
-; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_sbyte v22, v[9:10], off
-; GFX10-DL-NEXT:    global_load_sbyte v23, v[9:10], off offset:8
-; GFX10-DL-NEXT:    s_clause 0x8
-; GFX10-DL-NEXT:    global_load_sbyte v24, v[13:14], off
-; GFX10-DL-NEXT:    global_load_sbyte v25, v[13:14], off offset:1
-; GFX10-DL-NEXT:    global_load_sbyte v26, v[13:14], off offset:2
-; GFX10-DL-NEXT:    global_load_sbyte v27, v[13:14], off offset:3
-; GFX10-DL-NEXT:    global_load_sbyte v28, v[13:14], off offset:4
-; GFX10-DL-NEXT:    global_load_sbyte v29, v[13:14], off offset:5
-; GFX10-DL-NEXT:    ; meta instruction
-; GFX10-DL-NEXT:    ; meta instruction
-; GFX10-DL-NEXT:    global_load_sbyte v9, v[13:14], off offset:6
-; GFX10-DL-NEXT:    global_load_sbyte v10, v[13:14], off offset:7
-; GFX10-DL-NEXT:    global_load_sbyte v11, v[13:14], off offset:8
-; GFX10-DL-NEXT:    s_add_u32 s0, s0, 9
-; GFX10-DL-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-DL-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(16)
-; GFX10-DL-NEXT:    v_perm_b32 v12, v16, v15, 0xc0c0400
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(14)
-; GFX10-DL-NEXT:    v_perm_b32 v13, v18, v17, 0x4000c0c
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(12)
-; GFX10-DL-NEXT:    v_perm_b32 v16, v20, v19, 0xc0c0400
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-DL-NEXT:    v_perm_b32 v17, v23, v21, 0x4000c0c
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-DL-NEXT:    v_mad_i32_i24 v8, v24, v22, v8
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-DL-NEXT:    v_perm_b32 v14, v26, v25, 0xc0c0400
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-DL-NEXT:    v_perm_b32 v15, v28, v27, 0x4000c0c
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_perm_b32 v9, v9, v29, 0xc0c0400
-; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_perm_b32 v10, v11, v10, 0x4000c0c
-; GFX10-DL-NEXT:    v_or_b32_e32 v11, v13, v12
-; GFX10-DL-NEXT:    v_or_b32_e32 v12, v15, v14
-; GFX10-DL-NEXT:    v_or_b32_e32 v13, v17, v16
-; GFX10-DL-NEXT:    v_or_b32_e32 v9, v10, v9
-; GFX10-DL-NEXT:    v_dot4c_i32_i8 v8, v12, v11
-; GFX10-DL-NEXT:    v_dot4c_i32_i8 v8, v9, v13
-; GFX10-DL-NEXT:    s_cbranch_scc1 .LBB17_3
-; GFX10-DL-NEXT:  ; %bb.4: ; %.110
-; GFX10-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX10-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[0:1]
-; GFX10-DL-NEXT:    v_add_co_u32 v6, s0, 0x900, v6
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
-; GFX10-DL-NEXT:    v_add_co_u32 v9, vcc_lo, s4, v9
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v0, 32
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v2, vcc_lo, 0x900, v2
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-DL-NEXT:    v_add_co_u32 v4, vcc_lo, 0x900, v4
-; GFX10-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX10-DL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX10-DL-NEXT:    v_mov_b32_e32 v0, v11
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, v12
-; GFX10-DL-NEXT:    global_store_dword v[9:10], v8, off
-; GFX10-DL-NEXT:    s_or_b32 s6, vcc_lo, s6
-; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
-; GFX10-DL-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX10-DL-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX10-DL-NEXT:    s_endpgm
-;
-; GFX11-DL-LABEL: ByteOffsetCorrectness:
-; GFX11-DL:       ; %bb.0: ; %.entry
-; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11-DL-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_cmpx_gt_i64_e32 2, v[1:2]
-; GFX11-DL-NEXT:    s_cbranch_execz .LBB17_5
-; GFX11-DL-NEXT:  ; %bb.1: ; %.lr.ph.preheader
-; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 20, 10
-; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-DL-NEXT:    v_bfe_u32 v0, v0, 10, 10
-; GFX11-DL-NEXT:    v_mul_hi_u32_u24_e32 v4, 0x48, v1
-; GFX11-DL-NEXT:    s_load_b64 s[2:3], s[2:3], 0x34
-; GFX11-DL-NEXT:    v_mul_hi_u32_u24_e32 v3, 0x900, v5
-; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, 0x900, v5
-; GFX11-DL-NEXT:    v_add_nc_u32_e32 v9, v0, v5
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3]
-; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v3, 0x48, v1
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 5, v9
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-NEXT:    v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4]
-; GFX11-DL-NEXT:    v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6]
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_add_co_u32 v0, s0, v0, v1
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-DL-NEXT:    v_add_co_u32 v2, vcc_lo, s4, v7
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v4, vcc_lo, s4, v9
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v6, vcc_lo, s6, v7
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo
-; GFX11-DL-NEXT:    s_movk_i32 s4, 0xffe1
-; GFX11-DL-NEXT:    s_mov_b32 s5, -1
-; GFX11-DL-NEXT:    s_mov_b32 s6, 0
-; GFX11-DL-NEXT:  .LBB17_2: ; %.lr.ph
-; GFX11-DL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-DL-NEXT:    ; Child Loop BB17_3 Depth 2
-; GFX11-DL-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-DL-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-DL-NEXT:  .LBB17_3: ; %.preheader2
-; GFX11-DL-NEXT:    ; Parent Loop BB17_2 Depth=1
-; GFX11-DL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT:    v_add_co_u32 v9, vcc_lo, v4, s0
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v2, s0
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v13, vcc_lo, v6, s0
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
-; GFX11-DL-NEXT:    s_clause 0x6
-; GFX11-DL-NEXT:    global_load_i8 v15, v[11:12], off offset:1
-; GFX11-DL-NEXT:    global_load_i8 v16, v[11:12], off offset:2
-; GFX11-DL-NEXT:    global_load_i8 v17, v[11:12], off offset:3
-; GFX11-DL-NEXT:    global_load_i8 v18, v[11:12], off offset:4
-; GFX11-DL-NEXT:    global_load_i8 v19, v[11:12], off offset:5
-; GFX11-DL-NEXT:    global_load_i8 v20, v[11:12], off offset:6
-; GFX11-DL-NEXT:    global_load_i8 v11, v[11:12], off offset:7
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_i8 v12, v[9:10], off
-; GFX11-DL-NEXT:    global_load_i8 v9, v[9:10], off offset:8
-; GFX11-DL-NEXT:    s_clause 0x8
-; GFX11-DL-NEXT:    global_load_i8 v10, v[13:14], off
-; GFX11-DL-NEXT:    global_load_i8 v21, v[13:14], off offset:1
-; GFX11-DL-NEXT:    global_load_i8 v22, v[13:14], off offset:2
-; GFX11-DL-NEXT:    global_load_i8 v23, v[13:14], off offset:3
-; GFX11-DL-NEXT:    global_load_i8 v24, v[13:14], off offset:4
-; GFX11-DL-NEXT:    global_load_i8 v25, v[13:14], off offset:5
-; GFX11-DL-NEXT:    global_load_i8 v26, v[13:14], off offset:6
-; GFX11-DL-NEXT:    global_load_i8 v27, v[13:14], off offset:7
-; GFX11-DL-NEXT:    global_load_i8 v13, v[13:14], off offset:8
-; GFX11-DL-NEXT:    s_add_u32 s0, s0, 9
-; GFX11-DL-NEXT:    s_addc_u32 s1, s1, 0
-; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT:    s_cmp_lg_u64 s[0:1], 0x48
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-DL-NEXT:    v_perm_b32 v9, v9, v11, 0x4000c0c
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-DL-NEXT:    v_mad_i32_i24 v8, v10, v12, v8
-; GFX11-DL-NEXT:    v_perm_b32 v10, v16, v15, 0xc0c0400
-; GFX11-DL-NEXT:    v_perm_b32 v12, v18, v17, 0x4000c0c
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-DL-NEXT:    v_perm_b32 v14, v22, v21, 0xc0c0400
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-DL-NEXT:    v_perm_b32 v15, v24, v23, 0x4000c0c
-; GFX11-DL-NEXT:    v_perm_b32 v16, v20, v19, 0xc0c0400
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-NEXT:    v_perm_b32 v11, v26, v25, 0xc0c0400
-; GFX11-DL-NEXT:    v_or_b32_e32 v10, v12, v10
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_perm_b32 v13, v13, v27, 0x4000c0c
-; GFX11-DL-NEXT:    v_or_b32_e32 v12, v15, v14
-; GFX11-DL-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_or_b32_e32 v11, v13, v11
-; GFX11-DL-NEXT:    v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0]
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0]
-; GFX11-DL-NEXT:    s_cbranch_scc1 .LBB17_3
-; GFX11-DL-NEXT:  ; %bb.4: ; %.110
-; GFX11-DL-NEXT:    ; in Loop: Header=BB17_2 Depth=1
-; GFX11-DL-NEXT:    v_lshlrev_b64 v[9:10], 2, v[0:1]
-; GFX11-DL-NEXT:    v_add_co_u32 v6, s0, 0x900, v6
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
-; GFX11-DL-NEXT:    v_add_co_u32 v9, vcc_lo, s2, v9
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v11, vcc_lo, v0, 32
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v2, vcc_lo, 0x900, v2
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-DL-NEXT:    v_add_co_u32 v4, vcc_lo, 0x900, v4
-; GFX11-DL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX11-DL-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-DL-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-DL-NEXT:    global_store_b32 v[9:10], v8, off
-; GFX11-DL-NEXT:    s_or_b32 s6, vcc_lo, s6
-; GFX11-DL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-DL-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s6
-; GFX11-DL-NEXT:    s_cbranch_execnz .LBB17_2
-; GFX11-DL-NEXT:  .LBB17_5: ; %._crit_edge
-; GFX11-DL-NEXT:    s_endpgm
-.entry:
-  %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %sworkitemx = sext i32 %workitemx to i64
-  %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %sworkitemy = sext i32 %workitemy to i64
-  %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
-  %sworkitemz = sext i32 %workitemz to i64
-  %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz
-  %ivtemp1 = shl nsw i64 %ivtemp0, 5
-  %iv = add nsw i64 %ivtemp1, %sworkitemx
-  %cmp = icmp slt i64 %sworkitemx, 2
-  br i1 %cmp, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %.entry, %.110
-  %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ]
-  %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi
-  %scalarmul = mul nsw i64 %phi, 72
-  br label %.preheader2
-
-.preheader2:                                      ; preds = %.lr.ph, %.preheader2
-  %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ]
-  %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ]
-  %mul0 = mul nuw nsw i64 %phi1, 9
-  %scalaradd = add nsw i64 %mul0, %scalarmul
-  %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd
-  %l10 = load i8, ptr addrspace(1) %gep10, align 1
-  %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd
-  %l11 = load i8, ptr addrspace(1) %gep11, align 1
-  %op11 = sext i8 %l10 to i32
-  %op10 = sext i8 %l11 to i32
-  %mul1 = mul nsw i32 %op10, %op11
-  %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67
-  %off2 = add nsw i64 %scalaradd, 1
-  %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2
-  %l21 = load i8, ptr addrspace(1) %gep21, align 1
-  %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2
-  %l20 = load i8, ptr addrspace(1) %gep20, align 1
-  %op21 = sext i8 %l21 to i32
-  %op20 = sext i8 %l20 to i32
-  %mul2 = mul nsw i32 %op20, %op21
-  %ivadd2 = add i32 %mul2, %ivadd1
-  %off3 = add nsw i64 %scalaradd, 2
-  %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3
-  %l31 = load i8, ptr addrspace(1) %gep31, align 1
-  %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3
-  %l30 = load i8, ptr addrspace(1) %gep30, align 1
-  %op31 = sext i8 %l31 to i32
-  %op30 = sext i8 %l30 to i32
-  %mul3 = mul nsw i32 %op30, %op31
-  %ivadd3 = add i32 %mul3, %ivadd2
-  %off4 = add nsw i64 %scalaradd, 3
-  %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4
-  %l41 = load i8, ptr addrspace(1) %gep41, align 1
-  %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4
-  %l40 = load i8, ptr addrspace(1) %gep40, align 1
-  %op41 = sext i8 %l41 to i32
-  %op40 = sext i8 %l40 to i32
-  %mul4 = mul nsw i32 %op40, %op41
-  %ivadd4 = add i32 %mul4, %ivadd3
-  %off5 = add nsw i64 %scalaradd, 4
-  %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5
-  %l51 = load i8, ptr addrspace(1) %gep51, align 1
-  %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5
-  %l50 = load i8, ptr addrspace(1) %gep50, align 1
-  %op51 = sext i8 %l51 to i32
-  %op50 = sext i8 %l50 to i32
-  %mul5 = mul nsw i32 %op50, %op51
-  %ivadd5 = add i32 %mul5, %ivadd4
-  %off6 = add nsw i64 %scalaradd, 5
-  %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6
-  %l61 = load i8, ptr addrspace(1) %gep61, align 1
-  %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6
-  %l60 = load i8, ptr addrspace(1) %gep60, align 1
-  %op61 = sext i8 %l61 to i32
-  %op60 = sext i8 %l60 to i32
-  %mul6 = mul nsw i32 %op60, %op61
-  %ivadd6 = add i32 %mul6, %ivadd5
-  %off7 = add nsw i64 %scalaradd, 6
-  %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7
-  %l71 = load i8, ptr addrspace(1) %gep71, align 1
-  %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7
-  %l70 = load i8, ptr addrspace(1) %gep70, align 1
-  %op71 = sext i8 %l71 to i32
-  %op70 = sext i8 %l70 to i32
-  %mul7 = mul nsw i32 %op70, %op71
-  %ivadd7 = add i32 %mul7, %ivadd6
-  %off8 = add nsw i64 %scalaradd, 7
-  %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8
-  %l81 = load i8, ptr addrspace(1) %gep81, align 1
-  %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8
-  %l80 = load i8, ptr addrspace(1) %gep80, align 1
-  %op81 = sext i8 %l81 to i32
-  %op80 = sext i8 %l80 to i32
-  %mul8 = mul nsw i32 %op80, %op81
-  %ivadd8 = add i32 %mul8, %ivadd7
-  %off9 = add nsw i64 %scalaradd, 8
-  %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9
-  %l91 = load i8, ptr addrspace(1) %gep91, align 1
-  %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9
-  %l90 = load i8, ptr addrspace(1) %gep90, align 1
-  %op91 = sext i8 %l91 to i32
-  %op90 = sext i8 %l90 to i32
-  %mul9 = mul nsw i32 %op90, %op91
-  %ivadd9 = add i32 %mul9, %ivadd8
-  %limit = add nuw nsw i64 %phi1, 1
-  %exitcond.not = icmp eq i64 %limit, 8
-  br i1 %exitcond.not, label %.110, label %.preheader2
-
-.110:                                              ; preds = %.preheader2
-  store i32 %ivadd9, ptr addrspace(1) %outptr, align 4
-  %outerlimit = add nsw i64 %phi, 32
-  %outerexitcond = icmp slt i64 %phi, -30
-  br i1 %outerexitcond, label %.lr.ph, label %._crit_edge
-
-._crit_edge:                                      ; preds = %.110, %.3
-  ret void
-}
-
-
 declare i32 @llvm.amdgcn.workitem.id.x()
-declare i32 @llvm.amdgcn.workitem.id.y()
-declare i32 @llvm.amdgcn.workitem.id.z()
+

>From 521267d82726275f7dc460200326dabd60510b6e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:37:53 -0800
Subject: [PATCH 4/5] Remove newline

Change-Id: I670d272205b5431a1fc434abd94550747c49c15e
---
 llvm/test/CodeGen/AMDGPU/idot4s.ll | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 17182b20bfba7d..4262ec1057924a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3450,4 +3450,3 @@ entry:
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
-

>From 5343dd6ca665c02d95228c90b4a9dbecefa8cec8 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 6 Nov 2024 17:38:33 -0800
Subject: [PATCH 5/5] Add newline

Change-Id: If26584f3e25c5a1e4ec33ca71ac1d331eae24103
---
 llvm/test/CodeGen/AMDGPU/idot4s.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 4262ec1057924a..108d85e024ad76 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3449,4 +3449,5 @@ entry:
   ret void
 }
 
+
 declare i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list