[llvm] [AMDGPU] Fix typo in v_dot4 combine (PR #115224)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 14:04:35 PST 2024
================
@@ -3450,4 +3450,850 @@ entry:
}
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
+; GFX7-LABEL: ByteOffsetCorrectness:
+; GFX7: ; %bb.0: ; %.entry
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB17_5
+; GFX7-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v2
+; GFX7-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX7-NEXT: s_movk_i32 s0, 0x900
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 5, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, v6, v0
+; GFX7-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, s10, v4
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX7-NEXT: v_mov_b32_e32 v6, s9
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, s8, v4
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x48
+; GFX7-NEXT: s_movk_i32 s10, 0xffe1
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v7, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: s_mov_b32 s11, -1
+; GFX7-NEXT: s_mov_b64 s[12:13], 0
+; GFX7-NEXT: .LBB17_2: ; %.lr.ph
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX7-NEXT: v_mov_b32_e32 v8, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX7-NEXT: .LBB17_3: ; %.preheader2
+; GFX7-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: s_add_u32 s0, s0, 9
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
+; GFX7-NEXT: s_and_b64 vcc, exec, vcc
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mad_i32_i24 v8, v18, v9, v8
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mad_i32_i24 v8, v19, v10, v8
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mad_i32_i24 v8, v20, v11, v8
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mad_i32_i24 v8, v21, v12, v8
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mad_i32_i24 v8, v22, v13, v8
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mad_i32_i24 v8, v23, v14, v8
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mad_i32_i24 v8, v24, v15, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mad_i32_i24 v8, v25, v16, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v8, v26, v17, v8
+; GFX7-NEXT: s_cbranch_vccnz .LBB17_3
+; GFX7-NEXT: ; %bb.4: ; %.110
+; GFX7-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX7-NEXT: v_lshl_b64 v[9:10], v[2:3], 2
+; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
+; GFX7-NEXT: buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v2
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x900, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX7-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; GFX7-NEXT: s_cbranch_execnz .LBB17_2
+; GFX7-NEXT: .LBB17_5: ; %._crit_edge
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: ByteOffsetCorrectness:
+; GFX8: ; %bb.0: ; %.entry
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB17_5
+; GFX8-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s0, 0x900
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, 0x900, v3
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, 0x900, v3
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
+; GFX8-NEXT: s_movk_i32 s0, 0x48
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s6, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v4, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2
+; GFX8-NEXT: s_movk_i32 s4, 0xffe1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v4, v3, vcc
+; GFX8-NEXT: s_mov_b32 s5, -1
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB17_2: ; %.lr.ph
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX8-NEXT: v_mov_b32_e32 v10, 0
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB17_3: ; %.preheader2
+; GFX8-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v8
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v11, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v12, v[2:3]
+; GFX8-NEXT: s_add_u32 s0, s0, 9
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v12, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v4, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v2, v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_mad_i32_i24 v10, v10, v13, v12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v10, v2, v4, v10
+; GFX8-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX8-NEXT: ; %bb.4: ; %.110
+; GFX8-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v10
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x900, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x900, v8
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB17_2
+; GFX8-NEXT: .LBB17_5: ; %._crit_edge
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: ByteOffsetCorrectness:
+; GFX9-NODL: ; %bb.0: ; %.entry
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-NODL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NODL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NODL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-NODL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-NODL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-NODL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-NODL-NEXT: s_movk_i32 s6, 0xffe1
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: s_mov_b32 s7, -1
+; GFX9-NODL-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NODL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-NODL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-NODL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NODL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-NODL-NEXT: .LBB17_3: ; %.preheader2
+; GFX9-NODL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX9-NODL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-NODL-NEXT: global_load_sbyte v15, v[13:14], off
+; GFX9-NODL-NEXT: global_load_sbyte v16, v[11:12], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v17, v[11:12], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v18, v[11:12], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v19, v[11:12], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v20, v[11:12], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v21, v[11:12], off offset:6
+; GFX9-NODL-NEXT: global_load_sbyte v22, v[11:12], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v23, v[9:10], off
+; GFX9-NODL-NEXT: global_load_sbyte v24, v[9:10], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v25, v[9:10], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v26, v[9:10], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v27, v[9:10], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v28, v[9:10], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v29, v[9:10], off offset:6
+; GFX9-NODL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-NODL-NEXT: global_load_sbyte v11, v[9:10], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v12, v[13:14], off offset:8
+; GFX9-NODL-NEXT: global_load_sbyte v30, v[9:10], off offset:8
+; GFX9-NODL-NEXT: s_add_u32 s10, s10, 9
+; GFX9-NODL-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-NODL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v23, v15, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v24, v16, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v25, v17, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v26, v18, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v27, v19, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v28, v20, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v29, v21, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v11, v22, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v30, v12, v8
+; GFX9-NODL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX9-NODL-NEXT: ; %bb.4: ; %.110
+; GFX9-NODL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX9-NODL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-NODL-NEXT: global_store_dword v[9:10], v8, off
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NODL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NODL-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-NODL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NODL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-NODL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: ByteOffsetCorrectness:
+; GFX9-DL: ; %bb.0: ; %.entry
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-DL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-DL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-DL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-DL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-DL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-DL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-DL-NEXT: s_movk_i32 s8, 0xffe1
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-DL-NEXT: s_mov_b32 s12, 0xc0c0400
+; GFX9-DL-NEXT: s_mov_b32 s9, -1
+; GFX9-DL-NEXT: s_mov_b32 s13, 0x4000c0c
+; GFX9-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-DL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX9-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX9-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-DL-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-DL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-DL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
+; GFX9-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
+; GFX9-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
+; GFX9-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
+; GFX9-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
+; GFX9-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
+; GFX9-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
+; GFX9-DL-NEXT: global_load_sbyte v22, v[13:14], off
+; GFX9-DL-NEXT: global_load_sbyte v23, v[13:14], off offset:8
+; GFX9-DL-NEXT: global_load_sbyte v24, v[9:10], off
+; GFX9-DL-NEXT: global_load_sbyte v25, v[9:10], off offset:1
+; GFX9-DL-NEXT: global_load_sbyte v26, v[9:10], off offset:2
+; GFX9-DL-NEXT: global_load_sbyte v27, v[9:10], off offset:3
+; GFX9-DL-NEXT: global_load_sbyte v28, v[9:10], off offset:4
+; GFX9-DL-NEXT: global_load_sbyte v29, v[9:10], off offset:5
+; GFX9-DL-NEXT: ; kill: killed $vgpr13 killed $vgpr14
+; GFX9-DL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-DL-NEXT: global_load_sbyte v11, v[9:10], off offset:6
+; GFX9-DL-NEXT: global_load_sbyte v12, v[9:10], off offset:7
+; GFX9-DL-NEXT: global_load_sbyte v13, v[9:10], off offset:8
+; GFX9-DL-NEXT: s_add_u32 s10, s10, 9
+; GFX9-DL-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-DL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-DL-NEXT: s_waitcnt vmcnt(16)
+; GFX9-DL-NEXT: v_perm_b32 v9, v16, v15, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(14)
+; GFX9-DL-NEXT: v_perm_b32 v10, v18, v17, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX9-DL-NEXT: s_waitcnt vmcnt(12)
+; GFX9-DL-NEXT: v_perm_b32 v16, v20, v19, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX9-DL-NEXT: v_perm_b32 v17, v23, v21, s13
+; GFX9-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX9-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-DL-NEXT: v_perm_b32 v14, v26, v25, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-DL-NEXT: v_perm_b32 v15, v28, v27, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v10, v15, v14
+; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v10, v9, v8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-DL-NEXT: v_perm_b32 v11, v11, v29, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_perm_b32 v12, v13, v12, s13
+; GFX9-DL-NEXT: v_or_b32_e32 v13, v17, v16
+; GFX9-DL-NEXT: v_or_b32_e32 v11, v12, v11
+; GFX9-DL-NEXT: v_dot4_i32_i8 v8, v11, v13, v8
+; GFX9-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX9-DL-NEXT: ; %bb.4: ; %.110
+; GFX9-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX9-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-DL-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-DL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-DL-NEXT: global_store_dword v[9:10], v8, off
+; GFX9-DL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-DL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[8:9], v[4:5]
+; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-DL-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-DL-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX9-DL-NEXT: s_endpgm
+;
+; GFX10-DL-LABEL: ByteOffsetCorrectness:
+; GFX10-DL: ; %bb.0: ; %.entry
+; GFX10-DL-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-DL-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
+; GFX10-DL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX10-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX10-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v5, 0x900, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, 0x900, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v6, v3, v2
+; GFX10-DL-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX10-DL-NEXT: s_movk_i32 s2, 0xffe1
+; GFX10-DL-NEXT: v_mad_u64_u32 v[3:4], s0, 0x900, v3, v[4:5]
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 5, v6
+; GFX10-DL-NEXT: v_mad_u64_u32 v[6:7], s0, 0x900, v6, v[1:2]
+; GFX10-DL-NEXT: s_mov_b32 s3, -1
+; GFX10-DL-NEXT: s_mov_b32 s6, 0
+; GFX10-DL-NEXT: v_mad_u64_u32 v[4:5], s0, 0x48, v0, v[3:4]
+; GFX10-DL-NEXT: v_add_co_u32 v0, s0, v8, v0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, s8, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s9, v7, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, s8, v4
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s9, v5, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v6, vcc_lo, s10, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s11, v7, vcc_lo
+; GFX10-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX10-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX10-DL-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX10-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX10-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX10-DL-NEXT: s_clause 0x6
+; GFX10-DL-NEXT: global_load_sbyte v15, v[11:12], off offset:1
+; GFX10-DL-NEXT: global_load_sbyte v16, v[11:12], off offset:2
+; GFX10-DL-NEXT: global_load_sbyte v17, v[11:12], off offset:3
+; GFX10-DL-NEXT: global_load_sbyte v18, v[11:12], off offset:4
+; GFX10-DL-NEXT: global_load_sbyte v19, v[11:12], off offset:5
+; GFX10-DL-NEXT: global_load_sbyte v20, v[11:12], off offset:6
+; GFX10-DL-NEXT: global_load_sbyte v21, v[11:12], off offset:7
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_sbyte v22, v[9:10], off
+; GFX10-DL-NEXT: global_load_sbyte v23, v[9:10], off offset:8
+; GFX10-DL-NEXT: s_clause 0x8
+; GFX10-DL-NEXT: global_load_sbyte v24, v[13:14], off
+; GFX10-DL-NEXT: global_load_sbyte v25, v[13:14], off offset:1
+; GFX10-DL-NEXT: global_load_sbyte v26, v[13:14], off offset:2
+; GFX10-DL-NEXT: global_load_sbyte v27, v[13:14], off offset:3
+; GFX10-DL-NEXT: global_load_sbyte v28, v[13:14], off offset:4
+; GFX10-DL-NEXT: global_load_sbyte v29, v[13:14], off offset:5
+; GFX10-DL-NEXT: ; meta instruction
+; GFX10-DL-NEXT: ; meta instruction
+; GFX10-DL-NEXT: global_load_sbyte v9, v[13:14], off offset:6
+; GFX10-DL-NEXT: global_load_sbyte v10, v[13:14], off offset:7
+; GFX10-DL-NEXT: global_load_sbyte v11, v[13:14], off offset:8
+; GFX10-DL-NEXT: s_add_u32 s0, s0, 9
+; GFX10-DL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX10-DL-NEXT: s_waitcnt vmcnt(16)
+; GFX10-DL-NEXT: v_perm_b32 v12, v16, v15, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(14)
+; GFX10-DL-NEXT: v_perm_b32 v13, v18, v17, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(12)
+; GFX10-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX10-DL-NEXT: v_perm_b32 v17, v23, v21, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX10-DL-NEXT: v_mad_i32_i24 v8, v24, v22, v8
+; GFX10-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX10-DL-NEXT: v_perm_b32 v14, v26, v25, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX10-DL-NEXT: v_perm_b32 v15, v28, v27, 0x4000c0c
+; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX10-DL-NEXT: v_perm_b32 v9, v9, v29, 0xc0c0400
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_perm_b32 v10, v11, v10, 0x4000c0c
+; GFX10-DL-NEXT: v_or_b32_e32 v11, v13, v12
+; GFX10-DL-NEXT: v_or_b32_e32 v12, v15, v14
+; GFX10-DL-NEXT: v_or_b32_e32 v13, v17, v16
+; GFX10-DL-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v12, v11
+; GFX10-DL-NEXT: v_dot4c_i32_i8 v8, v9, v13
+; GFX10-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX10-DL-NEXT: ; %bb.4: ; %.110
+; GFX10-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX10-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX10-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
+; GFX10-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-DL-NEXT: v_add_co_u32 v9, vcc_lo, s4, v9
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s5, v10, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX10-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX10-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, v11
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, v12
+; GFX10-DL-NEXT: global_store_dword v[9:10], v8, off
+; GFX10-DL-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-DL-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
+; GFX10-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX10-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX10-DL-NEXT: s_endpgm
+;
+; GFX11-DL-LABEL: ByteOffsetCorrectness:
+; GFX11-DL: ; %bb.0: ; %.entry
+; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-DL-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_cmpx_gt_i64_e32 2, v[1:2]
+; GFX11-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX11-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 20, 10
+; GFX11-DL-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 10, 10
+; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v4, 0x48, v1
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[2:3], 0x34
+; GFX11-DL-NEXT: v_mul_hi_u32_u24_e32 v3, 0x900, v5
+; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, 0x900, v5
+; GFX11-DL-NEXT: v_add_nc_u32_e32 v9, v0, v5
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_mad_u64_u32 v[5:6], null, 0x900, v0, v[2:3]
+; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, 0x48, v1
+; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 5, v9
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT: v_mad_u64_u32 v[7:8], null, 0x900, v9, v[3:4]
+; GFX11-DL-NEXT: v_mad_u64_u32 v[9:10], null, 0x48, v1, v[5:6]
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT: v_add_co_u32 v0, s0, v0, v1
+; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, s4, v7
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v8, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, s4, v9
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v10, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v6, vcc_lo, s6, v7
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v8, vcc_lo
+; GFX11-DL-NEXT: s_movk_i32 s4, 0xffe1
+; GFX11-DL-NEXT: s_mov_b32 s5, -1
+; GFX11-DL-NEXT: s_mov_b32 s6, 0
+; GFX11-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX11-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX11-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX11-DL-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-DL-NEXT: .LBB17_3: ; %.preheader2
+; GFX11-DL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX11-DL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, v4, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v5, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v2, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, s1, v3, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v13, vcc_lo, v6, s0
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, s1, v7, vcc_lo
+; GFX11-DL-NEXT: s_clause 0x6
+; GFX11-DL-NEXT: global_load_i8 v15, v[11:12], off offset:1
+; GFX11-DL-NEXT: global_load_i8 v16, v[11:12], off offset:2
+; GFX11-DL-NEXT: global_load_i8 v17, v[11:12], off offset:3
+; GFX11-DL-NEXT: global_load_i8 v18, v[11:12], off offset:4
+; GFX11-DL-NEXT: global_load_i8 v19, v[11:12], off offset:5
+; GFX11-DL-NEXT: global_load_i8 v20, v[11:12], off offset:6
+; GFX11-DL-NEXT: global_load_i8 v11, v[11:12], off offset:7
+; GFX11-DL-NEXT: s_clause 0x1
+; GFX11-DL-NEXT: global_load_i8 v12, v[9:10], off
+; GFX11-DL-NEXT: global_load_i8 v9, v[9:10], off offset:8
+; GFX11-DL-NEXT: s_clause 0x8
+; GFX11-DL-NEXT: global_load_i8 v10, v[13:14], off
+; GFX11-DL-NEXT: global_load_i8 v21, v[13:14], off offset:1
+; GFX11-DL-NEXT: global_load_i8 v22, v[13:14], off offset:2
+; GFX11-DL-NEXT: global_load_i8 v23, v[13:14], off offset:3
+; GFX11-DL-NEXT: global_load_i8 v24, v[13:14], off offset:4
+; GFX11-DL-NEXT: global_load_i8 v25, v[13:14], off offset:5
+; GFX11-DL-NEXT: global_load_i8 v26, v[13:14], off offset:6
+; GFX11-DL-NEXT: global_load_i8 v27, v[13:14], off offset:7
+; GFX11-DL-NEXT: global_load_i8 v13, v[13:14], off offset:8
+; GFX11-DL-NEXT: s_add_u32 s0, s0, 9
+; GFX11-DL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX11-DL-NEXT: s_waitcnt vmcnt(9)
+; GFX11-DL-NEXT: v_perm_b32 v9, v9, v11, 0x4000c0c
+; GFX11-DL-NEXT: s_waitcnt vmcnt(8)
+; GFX11-DL-NEXT: v_mad_i32_i24 v8, v10, v12, v8
+; GFX11-DL-NEXT: v_perm_b32 v10, v16, v15, 0xc0c0400
+; GFX11-DL-NEXT: v_perm_b32 v12, v18, v17, 0x4000c0c
+; GFX11-DL-NEXT: s_waitcnt vmcnt(6)
+; GFX11-DL-NEXT: v_perm_b32 v14, v22, v21, 0xc0c0400
+; GFX11-DL-NEXT: s_waitcnt vmcnt(4)
+; GFX11-DL-NEXT: v_perm_b32 v15, v24, v23, 0x4000c0c
+; GFX11-DL-NEXT: v_perm_b32 v16, v20, v19, 0xc0c0400
+; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT: v_perm_b32 v11, v26, v25, 0xc0c0400
+; GFX11-DL-NEXT: v_or_b32_e32 v10, v12, v10
+; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT: v_perm_b32 v13, v13, v27, 0x4000c0c
+; GFX11-DL-NEXT: v_or_b32_e32 v12, v15, v14
+; GFX11-DL-NEXT: v_or_b32_e32 v9, v9, v16
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_or_b32_e32 v11, v13, v11
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v12, v10, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v8, v11, v9, v8 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX11-DL-NEXT: ; %bb.4: ; %.110
+; GFX11-DL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX11-DL-NEXT: v_lshlrev_b64 v[9:10], 2, v[0:1]
+; GFX11-DL-NEXT: v_add_co_u32 v6, s0, 0x900, v6
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT: v_add_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX11-DL-NEXT: v_add_co_u32 v9, vcc_lo, s2, v9
+; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 32
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v1, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v2, vcc_lo, 0x900, v2
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-DL-NEXT: v_add_co_u32 v4, vcc_lo, 0x900, v4
+; GFX11-DL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-DL-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-DL-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-DL-NEXT: global_store_b32 v[9:10], v8, off
+; GFX11-DL-NEXT: s_or_b32 s6, vcc_lo, s6
+; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-DL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s6
+; GFX11-DL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX11-DL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX11-DL-NEXT: s_endpgm
+.entry:
+ %workitemx = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %sworkitemx = sext i32 %workitemx to i64
+ %workitemy = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %sworkitemy = sext i32 %workitemy to i64
+ %workitemz = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %sworkitemz = sext i32 %workitemz to i64
+ %ivtemp0 = add nsw i64 %sworkitemy, %sworkitemz
+ %ivtemp1 = shl nsw i64 %ivtemp0, 5
+ %iv = add nsw i64 %ivtemp1, %sworkitemx
+ %cmp = icmp slt i64 %sworkitemx, 2
+ br i1 %cmp, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %.entry, %.110
+ %phi = phi i64 [ %outerlimit, %.110 ], [ %iv, %.entry ]
+ %outptr = getelementptr i32, ptr addrspace(1) %inptr2, i64 %phi
+ %scalarmul = mul nsw i64 %phi, 72
+ br label %.preheader2
+
+.preheader2: ; preds = %.lr.ph, %.preheader2
+ %phi1 = phi i64 [ 0, %.lr.ph ], [ %limit, %.preheader2 ]
+ %.lcssa4.lcssa67 = phi i32 [ 0, %.lr.ph ], [ %ivadd9, %.preheader2 ]
+ %mul0 = mul nuw nsw i64 %phi1, 9
+ %scalaradd = add nsw i64 %mul0, %scalarmul
+ %gep10 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %scalaradd
+ %l10 = load i8, ptr addrspace(1) %gep10, align 1
+ %gep11 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %scalaradd
+ %l11 = load i8, ptr addrspace(1) %gep11, align 1
+ %op11 = sext i8 %l10 to i32
+ %op10 = sext i8 %l11 to i32
+ %mul1 = mul nsw i32 %op10, %op11
+ %ivadd1 = add i32 %mul1, %.lcssa4.lcssa67
+ %off2 = add nsw i64 %scalaradd, 1
+ %gep21 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off2
+ %l21 = load i8, ptr addrspace(1) %gep21, align 1
+ %gep20 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off2
+ %l20 = load i8, ptr addrspace(1) %gep20, align 1
+ %op21 = sext i8 %l21 to i32
+ %op20 = sext i8 %l20 to i32
+ %mul2 = mul nsw i32 %op20, %op21
+ %ivadd2 = add i32 %mul2, %ivadd1
+ %off3 = add nsw i64 %scalaradd, 2
+ %gep31 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off3
+ %l31 = load i8, ptr addrspace(1) %gep31, align 1
+ %gep30 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off3
+ %l30 = load i8, ptr addrspace(1) %gep30, align 1
+ %op31 = sext i8 %l31 to i32
+ %op30 = sext i8 %l30 to i32
+ %mul3 = mul nsw i32 %op30, %op31
+ %ivadd3 = add i32 %mul3, %ivadd2
+ %off4 = add nsw i64 %scalaradd, 3
+ %gep41 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off4
+ %l41 = load i8, ptr addrspace(1) %gep41, align 1
+ %gep40 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off4
+ %l40 = load i8, ptr addrspace(1) %gep40, align 1
+ %op41 = sext i8 %l41 to i32
+ %op40 = sext i8 %l40 to i32
+ %mul4 = mul nsw i32 %op40, %op41
+ %ivadd4 = add i32 %mul4, %ivadd3
+ %off5 = add nsw i64 %scalaradd, 4
+ %gep51 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off5
+ %l51 = load i8, ptr addrspace(1) %gep51, align 1
+ %gep50 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off5
+ %l50 = load i8, ptr addrspace(1) %gep50, align 1
+ %op51 = sext i8 %l51 to i32
+ %op50 = sext i8 %l50 to i32
+ %mul5 = mul nsw i32 %op50, %op51
+ %ivadd5 = add i32 %mul5, %ivadd4
+ %off6 = add nsw i64 %scalaradd, 5
+ %gep61 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off6
+ %l61 = load i8, ptr addrspace(1) %gep61, align 1
+ %gep60 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off6
+ %l60 = load i8, ptr addrspace(1) %gep60, align 1
+ %op61 = sext i8 %l61 to i32
+ %op60 = sext i8 %l60 to i32
+ %mul6 = mul nsw i32 %op60, %op61
+ %ivadd6 = add i32 %mul6, %ivadd5
+ %off7 = add nsw i64 %scalaradd, 6
+ %gep71 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off7
+ %l71 = load i8, ptr addrspace(1) %gep71, align 1
+ %gep70 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off7
+ %l70 = load i8, ptr addrspace(1) %gep70, align 1
+ %op71 = sext i8 %l71 to i32
+ %op70 = sext i8 %l70 to i32
+ %mul7 = mul nsw i32 %op70, %op71
+ %ivadd7 = add i32 %mul7, %ivadd6
+ %off8 = add nsw i64 %scalaradd, 7
+ %gep81 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off8
+ %l81 = load i8, ptr addrspace(1) %gep81, align 1
+ %gep80 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off8
+ %l80 = load i8, ptr addrspace(1) %gep80, align 1
+ %op81 = sext i8 %l81 to i32
+ %op80 = sext i8 %l80 to i32
+ %mul8 = mul nsw i32 %op80, %op81
+ %ivadd8 = add i32 %mul8, %ivadd7
+ %off9 = add nsw i64 %scalaradd, 8
+ %gep91 = getelementptr i8, ptr addrspace(1) %inptr0, i64 %off9
+ %l91 = load i8, ptr addrspace(1) %gep91, align 1
+ %gep90 = getelementptr i8, ptr addrspace(1) %inptr1, i64 %off9
+ %l90 = load i8, ptr addrspace(1) %gep90, align 1
+ %op91 = sext i8 %l91 to i32
+ %op90 = sext i8 %l90 to i32
+ %mul9 = mul nsw i32 %op90, %op91
+ %ivadd9 = add i32 %mul9, %ivadd8
+ %limit = add nuw nsw i64 %phi1, 1
+ %exitcond.not = icmp eq i64 %limit, 8
+ br i1 %exitcond.not, label %.110, label %.preheader2
+
+.110: ; preds = %.preheader2
----------------
arsenm wrote:
Shouldn't need control flow
https://github.com/llvm/llvm-project/pull/115224
More information about the llvm-commits mailing list