[llvm] [AMDGPU] Add MaxMemoryClauseSchedStrategy (PR #114957)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 00:34:00 PST 2024
https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/114957
>From 3f22c213bb04e283a269ea0fa585709084d8fe93 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 5 Nov 2024 11:24:50 +0800
Subject: [PATCH 1/8] [AMDGPU] Add a test for grouping image instructions
This is a motivating example that drives us to do better at grouping
image sample instructions.
---
.../AMDGPU/group-image-instructions.ll | 485 ++++++++++++++++++
1 file changed, 485 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
new file mode 100644
index 00000000000000..25ceb296ddc5fe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -0,0 +1,485 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
+; GFX11-LABEL: group_image_sample:
+; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: s_mov_b64 s[16:17], exec
+; GFX11-NEXT: s_wqm_b64 exec, exec
+; GFX11-NEXT: s_mov_b32 m0, s4
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: s_mov_b32 s6, s3
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s7, s5
+; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0
+; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
+; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b64 s[18:19], exec
+; GFX11-NEXT: s_wqm_b64 exec, exec
+; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
+; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
+; GFX11-NEXT: s_mov_b64 exec, s[18:19]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x10
+; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x20
+; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x30
+; GFX11-NEXT: v_interp_p10_f32 v4, v3, v0, v3 wait_exp:0
+; GFX11-NEXT: v_interp_p10_f32 v0, v2, v0, v2 wait_exp:7
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x40
+; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x50
+; GFX11-NEXT: v_interp_p2_f32 v42, v3, v1, v4 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v43, v2, v1, v0 wait_exp:7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v0, s18, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s19, v43
+; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
+; GFX11-NEXT: v_add_f32_e32 v8, s20, v42
+; GFX11-NEXT: v_add_f32_e32 v9, s21, v43
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v0, s22, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s23, v43
+; GFX11-NEXT: v_add_f32_e32 v20, s26, v42
+; GFX11-NEXT: v_add_f32_e32 v21, s27, v43
+; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v0, s24, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s25, v43
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
+; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[16:19], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v0, s18, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s19, v43
+; GFX11-NEXT: image_sample v[24:27], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x90
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0xa0
+; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0xb0
+; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0xc0
+; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0xd0
+; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0xe0
+; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xf0
+; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
+; GFX11-NEXT: v_add_f32_e32 v0, s20, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s21, v43
+; GFX11-NEXT: v_add_f32_e32 v32, s22, v42
+; GFX11-NEXT: v_add_f32_e32 v33, s23, v43
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v36, s18, v42
+; GFX11-NEXT: v_add_f32_e32 v37, s19, v43
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[28:31], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v0, s26, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s27, v43
+; GFX11-NEXT: v_add_f32_e32 v38, s24, v42
+; GFX11-NEXT: v_add_f32_e32 v39, s25, v43
+; GFX11-NEXT: v_add_f32_e32 v40, s28, v42
+; GFX11-NEXT: v_add_f32_e32 v41, s29, v43
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_f32_e32 v44, v8, v4
+; GFX11-NEXT: v_add_f32_e32 v45, v9, v5
+; GFX11-NEXT: v_add_f32_e32 v46, v10, v6
+; GFX11-NEXT: v_add_f32_e32 v47, v11, v7
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[4:7], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[8:11], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_f32_e32 v44, v12, v44
+; GFX11-NEXT: v_add_f32_e32 v45, v13, v45
+; GFX11-NEXT: v_add_f32_e32 v46, v14, v46
+; GFX11-NEXT: v_add_f32_e32 v47, v15, v47
+; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v36, s30, v42
+; GFX11-NEXT: v_add_f32_e32 v37, s31, v43
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_f32_e32 v0, v16, v44
+; GFX11-NEXT: v_add_f32_e32 v1, v17, v45
+; GFX11-NEXT: v_add_f32_e32 v44, v18, v46
+; GFX11-NEXT: v_add_f32_e32 v45, v19, v47
+; GFX11-NEXT: image_sample v[16:19], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_f32_e32 v46, v20, v0
+; GFX11-NEXT: v_add_f32_e32 v47, v21, v1
+; GFX11-NEXT: v_add_f32_e32 v44, v22, v44
+; GFX11-NEXT: v_add_f32_e32 v45, v23, v45
+; GFX11-NEXT: image_sample v[20:23], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v38, s34, v42
+; GFX11-NEXT: v_add_f32_e32 v39, s35, v43
+; GFX11-NEXT: v_add_f32_e32 v0, s36, v42
+; GFX11-NEXT: v_add_f32_e32 v1, s37, v43
+; GFX11-NEXT: v_add_f32_e32 v40, s12, v42
+; GFX11-NEXT: v_add_f32_e32 v41, s13, v43
+; GFX11-NEXT: s_and_b64 exec, exec, s[16:17]
+; GFX11-NEXT: image_sample v[36:39], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_add_f32_e32 v46, v24, v46
+; GFX11-NEXT: v_add_f32_e32 v47, v25, v47
+; GFX11-NEXT: v_add_f32_e32 v44, v26, v44
+; GFX11-NEXT: v_add_f32_e32 v45, v27, v45
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[24:27], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_add_f32_e32 v0, v28, v46
+; GFX11-NEXT: v_add_f32_e32 v1, v29, v47
+; GFX11-NEXT: v_add_f32_e32 v28, v30, v44
+; GFX11-NEXT: v_add_f32_e32 v29, v31, v45
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_add_f32_e32 v0, v32, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v33, v1
+; GFX11-NEXT: v_add_f32_e32 v28, v34, v28
+; GFX11-NEXT: v_add_f32_e32 v29, v35, v29
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v6, v28
+; GFX11-NEXT: v_add_f32_e32 v5, v7, v29
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_f32_e32 v0, v8, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v9, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v11, v5
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_add_f32_e32 v0, v12, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v13, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v14, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v15, v5
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v18, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v19, v5
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v22, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v23, v5
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_add_f32_e32 v0, v36, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v37, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v38, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v39, v5
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v26, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v27, v5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v0, v40, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v41, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v42, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v43, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5
+; GFX11-NEXT: exp mrt0 v0, v1, off, off done
+; GFX11-NEXT: s_endpgm
+.entry:
+ %0 = call i64 @llvm.amdgcn.s.getpc()
+ %1 = and i64 %0, -4294967296
+ %2 = zext i32 %userdata6 to i64
+ %3 = or disjoint i64 %1, %2
+ %4 = inttoptr i64 %3 to ptr addrspace(4)
+ %5 = load <4 x i32>, ptr addrspace(4) %4, align 16
+ %6 = zext i32 %userdata7 to i64
+ %7 = or disjoint i64 %1, %6
+ %8 = inttoptr i64 %7 to ptr addrspace(4)
+ %9 = load <4 x i32>, ptr addrspace(4) %8, align 4, !invariant.load !0
+ %10 = zext i32 %userdata8 to i64
+ %11 = or disjoint i64 %1, %10
+ %12 = inttoptr i64 %11 to ptr addrspace(4)
+ %13 = load <8 x i32>, ptr addrspace(4) %12, align 4, !invariant.load !0
+ %14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask)
+ %PerspInterpCenter.i1 = extractelement <2 x float> %PerspInterpCenter, i64 1
+ %PerspInterpCenter.i0 = extractelement <2 x float> %PerspInterpCenter, i64 0
+ %15 = call float @llvm.amdgcn.interp.inreg.p10(float %14, float %PerspInterpCenter.i0, float %14)
+ %16 = call float @llvm.amdgcn.interp.inreg.p2(float %14, float %PerspInterpCenter.i1, float %15)
+ %17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask)
+ %18 = call float @llvm.amdgcn.interp.inreg.p10(float %17, float %PerspInterpCenter.i0, float %17)
+ %19 = call float @llvm.amdgcn.interp.inreg.p2(float %17, float %PerspInterpCenter.i1, float %18)
+ %20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 16, i32 0), !invariant.load !0
+ %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %22 = bitcast <4 x i32> %21 to <4 x float>
+ %.i0 = extractelement <4 x float> %22, i64 0
+ %.i1 = extractelement <4 x float> %22, i64 1
+ %.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %19
+ %.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %16
+ %23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i03, float %.i14, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i010 = extractelement <4 x float> %23, i64 0
+ %.i113 = extractelement <4 x float> %23, i64 1
+ %.i215 = extractelement <4 x float> %23, i64 2
+ %.i317 = extractelement <4 x float> %23, i64 3
+ %24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 32, i32 0), !invariant.load !0
+ %25 = shufflevector <2 x i32> %24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %26 = bitcast <4 x i32> %25 to <4 x float>
+ %.i05 = extractelement <4 x float> %26, i64 0
+ %.i16 = extractelement <4 x float> %26, i64 1
+ %.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %19
+ %.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %16
+ %27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i07, float %.i18, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i09 = extractelement <4 x float> %27, i64 0
+ %.i011 = fadd reassoc nnan nsz arcp contract afn float %.i09, %.i010
+ %.i112 = extractelement <4 x float> %27, i64 1
+ %.i114 = fadd reassoc nnan nsz arcp contract afn float %.i112, %.i113
+ %.i2 = extractelement <4 x float> %27, i64 2
+ %.i216 = fadd reassoc nnan nsz arcp contract afn float %.i2, %.i215
+ %.i3 = extractelement <4 x float> %27, i64 3
+ %.i318 = fadd reassoc nnan nsz arcp contract afn float %.i3, %.i317
+ %28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 48, i32 0), !invariant.load !0
+ %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %30 = bitcast <4 x i32> %29 to <4 x float>
+ %.i019 = extractelement <4 x float> %30, i64 0
+ %.i120 = extractelement <4 x float> %30, i64 1
+ %.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %19
+ %.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %16
+ %31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i021, float %.i122, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i023 = extractelement <4 x float> %31, i64 0
+ %.i024 = fadd reassoc nnan nsz arcp contract afn float %.i023, %.i011
+ %.i125 = extractelement <4 x float> %31, i64 1
+ %.i126 = fadd reassoc nnan nsz arcp contract afn float %.i125, %.i114
+ %.i227 = extractelement <4 x float> %31, i64 2
+ %.i228 = fadd reassoc nnan nsz arcp contract afn float %.i227, %.i216
+ %.i329 = extractelement <4 x float> %31, i64 3
+ %.i330 = fadd reassoc nnan nsz arcp contract afn float %.i329, %.i318
+ %32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 64, i32 0), !invariant.load !0
+ %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %34 = bitcast <4 x i32> %33 to <4 x float>
+ %.i031 = extractelement <4 x float> %34, i64 0
+ %.i132 = extractelement <4 x float> %34, i64 1
+ %.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %19
+ %.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %16
+ %35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i033, float %.i134, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i035 = extractelement <4 x float> %35, i64 0
+ %.i036 = fadd reassoc nnan nsz arcp contract afn float %.i035, %.i024
+ %.i137 = extractelement <4 x float> %35, i64 1
+ %.i138 = fadd reassoc nnan nsz arcp contract afn float %.i137, %.i126
+ %.i239 = extractelement <4 x float> %35, i64 2
+ %.i240 = fadd reassoc nnan nsz arcp contract afn float %.i239, %.i228
+ %.i341 = extractelement <4 x float> %35, i64 3
+ %.i342 = fadd reassoc nnan nsz arcp contract afn float %.i341, %.i330
+ %36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 80, i32 0), !invariant.load !0
+ %37 = shufflevector <2 x i32> %36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %38 = bitcast <4 x i32> %37 to <4 x float>
+ %.i043 = extractelement <4 x float> %38, i64 0
+ %.i144 = extractelement <4 x float> %38, i64 1
+ %.i045 = fadd reassoc nnan nsz arcp contract afn float %.i043, %19
+ %.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %16
+ %39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i045, float %.i146, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i047 = extractelement <4 x float> %39, i64 0
+ %.i048 = fadd reassoc nnan nsz arcp contract afn float %.i047, %.i036
+ %.i149 = extractelement <4 x float> %39, i64 1
+ %.i150 = fadd reassoc nnan nsz arcp contract afn float %.i149, %.i138
+ %.i251 = extractelement <4 x float> %39, i64 2
+ %.i252 = fadd reassoc nnan nsz arcp contract afn float %.i251, %.i240
+ %.i353 = extractelement <4 x float> %39, i64 3
+ %.i354 = fadd reassoc nnan nsz arcp contract afn float %.i353, %.i342
+ %40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 96, i32 0), !invariant.load !0
+ %41 = shufflevector <2 x i32> %40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %42 = bitcast <4 x i32> %41 to <4 x float>
+ %.i055 = extractelement <4 x float> %42, i64 0
+ %.i156 = extractelement <4 x float> %42, i64 1
+ %.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %19
+ %.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %16
+ %43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i057, float %.i158, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i059 = extractelement <4 x float> %43, i64 0
+ %.i060 = fadd reassoc nnan nsz arcp contract afn float %.i059, %.i048
+ %.i161 = extractelement <4 x float> %43, i64 1
+ %.i162 = fadd reassoc nnan nsz arcp contract afn float %.i161, %.i150
+ %.i263 = extractelement <4 x float> %43, i64 2
+ %.i264 = fadd reassoc nnan nsz arcp contract afn float %.i263, %.i252
+ %.i365 = extractelement <4 x float> %43, i64 3
+ %.i366 = fadd reassoc nnan nsz arcp contract afn float %.i365, %.i354
+ %44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 112, i32 0), !invariant.load !0
+ %45 = shufflevector <2 x i32> %44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %46 = bitcast <4 x i32> %45 to <4 x float>
+ %.i067 = extractelement <4 x float> %46, i64 0
+ %.i168 = extractelement <4 x float> %46, i64 1
+ %.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %19
+ %.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %16
+ %47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i069, float %.i170, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i071 = extractelement <4 x float> %47, i64 0
+ %.i072 = fadd reassoc nnan nsz arcp contract afn float %.i071, %.i060
+ %.i173 = extractelement <4 x float> %47, i64 1
+ %.i174 = fadd reassoc nnan nsz arcp contract afn float %.i173, %.i162
+ %.i275 = extractelement <4 x float> %47, i64 2
+ %.i276 = fadd reassoc nnan nsz arcp contract afn float %.i275, %.i264
+ %.i377 = extractelement <4 x float> %47, i64 3
+ %.i378 = fadd reassoc nnan nsz arcp contract afn float %.i377, %.i366
+ %48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 128, i32 0), !invariant.load !0
+ %49 = shufflevector <2 x i32> %48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %50 = bitcast <4 x i32> %49 to <4 x float>
+ %.i079 = extractelement <4 x float> %50, i64 0
+ %.i180 = extractelement <4 x float> %50, i64 1
+ %.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %19
+ %.i182 = fadd reassoc nnan nsz arcp contract afn float %.i180, %16
+ %51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i081, float %.i182, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i083 = extractelement <4 x float> %51, i64 0
+ %.i084 = fadd reassoc nnan nsz arcp contract afn float %.i083, %.i072
+ %.i185 = extractelement <4 x float> %51, i64 1
+ %.i186 = fadd reassoc nnan nsz arcp contract afn float %.i185, %.i174
+ %.i287 = extractelement <4 x float> %51, i64 2
+ %.i288 = fadd reassoc nnan nsz arcp contract afn float %.i287, %.i276
+ %.i389 = extractelement <4 x float> %51, i64 3
+ %.i390 = fadd reassoc nnan nsz arcp contract afn float %.i389, %.i378
+ %52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 144, i32 0), !invariant.load !0
+ %53 = shufflevector <2 x i32> %52, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %54 = bitcast <4 x i32> %53 to <4 x float>
+ %.i091 = extractelement <4 x float> %54, i64 0
+ %.i192 = extractelement <4 x float> %54, i64 1
+ %.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %19
+ %.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %16
+ %55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i093, float %.i194, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i095 = extractelement <4 x float> %55, i64 0
+ %.i096 = fadd reassoc nnan nsz arcp contract afn float %.i095, %.i084
+ %.i197 = extractelement <4 x float> %55, i64 1
+ %.i198 = fadd reassoc nnan nsz arcp contract afn float %.i197, %.i186
+ %.i299 = extractelement <4 x float> %55, i64 2
+ %.i2100 = fadd reassoc nnan nsz arcp contract afn float %.i299, %.i288
+ %.i3101 = extractelement <4 x float> %55, i64 3
+ %.i3102 = fadd reassoc nnan nsz arcp contract afn float %.i3101, %.i390
+ %56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 160, i32 0), !invariant.load !0
+ %57 = shufflevector <2 x i32> %56, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %58 = bitcast <4 x i32> %57 to <4 x float>
+ %.i0103 = extractelement <4 x float> %58, i64 0
+ %.i1104 = extractelement <4 x float> %58, i64 1
+ %.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %19
+ %.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %16
+ %59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0105, float %.i1106, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0107 = extractelement <4 x float> %59, i64 0
+ %.i0108 = fadd reassoc nnan nsz arcp contract afn float %.i0107, %.i096
+ %.i1109 = extractelement <4 x float> %59, i64 1
+ %.i1110 = fadd reassoc nnan nsz arcp contract afn float %.i1109, %.i198
+ %.i2111 = extractelement <4 x float> %59, i64 2
+ %.i2112 = fadd reassoc nnan nsz arcp contract afn float %.i2111, %.i2100
+ %.i3113 = extractelement <4 x float> %59, i64 3
+ %.i3114 = fadd reassoc nnan nsz arcp contract afn float %.i3113, %.i3102
+ %60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 176, i32 0), !invariant.load !0
+ %61 = shufflevector <2 x i32> %60, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %62 = bitcast <4 x i32> %61 to <4 x float>
+ %.i0115 = extractelement <4 x float> %62, i64 0
+ %.i1116 = extractelement <4 x float> %62, i64 1
+ %.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %19
+ %.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %16
+ %63 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0117, float %.i1118, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0119 = extractelement <4 x float> %63, i64 0
+ %.i0120 = fadd reassoc nnan nsz arcp contract afn float %.i0119, %.i0108
+ %.i1121 = extractelement <4 x float> %63, i64 1
+ %.i1122 = fadd reassoc nnan nsz arcp contract afn float %.i1121, %.i1110
+ %.i2123 = extractelement <4 x float> %63, i64 2
+ %.i2124 = fadd reassoc nnan nsz arcp contract afn float %.i2123, %.i2112
+ %.i3125 = extractelement <4 x float> %63, i64 3
+ %.i3126 = fadd reassoc nnan nsz arcp contract afn float %.i3125, %.i3114
+ %64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 192, i32 0), !invariant.load !0
+ %65 = shufflevector <2 x i32> %64, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %66 = bitcast <4 x i32> %65 to <4 x float>
+ %.i0127 = extractelement <4 x float> %66, i64 0
+ %.i1128 = extractelement <4 x float> %66, i64 1
+ %.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %19
+ %.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %16
+ %67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0129, float %.i1130, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0131 = extractelement <4 x float> %67, i64 0
+ %.i0132 = fadd reassoc nnan nsz arcp contract afn float %.i0131, %.i0120
+ %.i1133 = extractelement <4 x float> %67, i64 1
+ %.i1134 = fadd reassoc nnan nsz arcp contract afn float %.i1133, %.i1122
+ %.i2135 = extractelement <4 x float> %67, i64 2
+ %.i2136 = fadd reassoc nnan nsz arcp contract afn float %.i2135, %.i2124
+ %.i3137 = extractelement <4 x float> %67, i64 3
+ %.i3138 = fadd reassoc nnan nsz arcp contract afn float %.i3137, %.i3126
+ %68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 208, i32 0), !invariant.load !0
+ %69 = shufflevector <2 x i32> %68, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %70 = bitcast <4 x i32> %69 to <4 x float>
+ %.i0139 = extractelement <4 x float> %70, i64 0
+ %.i1140 = extractelement <4 x float> %70, i64 1
+ %.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %19
+ %.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %16
+ %71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0141, float %.i1142, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0143 = extractelement <4 x float> %71, i64 0
+ %.i0144 = fadd reassoc nnan nsz arcp contract afn float %.i0143, %.i0132
+ %.i1145 = extractelement <4 x float> %71, i64 1
+ %.i1146 = fadd reassoc nnan nsz arcp contract afn float %.i1145, %.i1134
+ %.i2147 = extractelement <4 x float> %71, i64 2
+ %.i2148 = fadd reassoc nnan nsz arcp contract afn float %.i2147, %.i2136
+ %.i3149 = extractelement <4 x float> %71, i64 3
+ %.i3150 = fadd reassoc nnan nsz arcp contract afn float %.i3149, %.i3138
+ %72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 224, i32 0), !invariant.load !0
+ %73 = shufflevector <2 x i32> %72, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %74 = bitcast <4 x i32> %73 to <4 x float>
+ %.i0151 = extractelement <4 x float> %74, i64 0
+ %.i1152 = extractelement <4 x float> %74, i64 1
+ %.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %19
+ %.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %16
+ %75 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0153, float %.i1154, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0155 = extractelement <4 x float> %75, i64 0
+ %.i0156 = fadd reassoc nnan nsz arcp contract afn float %.i0155, %.i0144
+ %.i1157 = extractelement <4 x float> %75, i64 1
+ %.i1158 = fadd reassoc nnan nsz arcp contract afn float %.i1157, %.i1146
+ %.i2159 = extractelement <4 x float> %75, i64 2
+ %.i2160 = fadd reassoc nnan nsz arcp contract afn float %.i2159, %.i2148
+ %.i3161 = extractelement <4 x float> %75, i64 3
+ %.i3162 = fadd reassoc nnan nsz arcp contract afn float %.i3161, %.i3150
+ %76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 240, i32 0), !invariant.load !0
+ %77 = shufflevector <2 x i32> %76, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %78 = bitcast <4 x i32> %77 to <4 x float>
+ %.i0163 = extractelement <4 x float> %78, i64 0
+ %.i1164 = extractelement <4 x float> %78, i64 1
+ %.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %19
+ %.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %16
+ %79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0165, float %.i1166, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0167 = extractelement <4 x float> %79, i64 0
+ %.i0168 = fadd reassoc nnan nsz arcp contract afn float %.i0167, %.i0156
+ %.i1169 = extractelement <4 x float> %79, i64 1
+ %.i1170 = fadd reassoc nnan nsz arcp contract afn float %.i1169, %.i1158
+ %.i2171 = extractelement <4 x float> %79, i64 2
+ %.i2172 = fadd reassoc nnan nsz arcp contract afn float %.i2171, %.i2160
+ %.i3173 = extractelement <4 x float> %79, i64 3
+ %.i3174 = fadd reassoc nnan nsz arcp contract afn float %.i3173, %.i3162
+ %80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 256, i32 0), !invariant.load !0
+ %81 = shufflevector <2 x i32> %80, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %82 = bitcast <4 x i32> %81 to <4 x float>
+ %.i0175 = extractelement <4 x float> %82, i64 0
+ %.i1176 = extractelement <4 x float> %82, i64 1
+ %.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %19
+ %.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %16
+ %83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0177, float %.i1178, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
+ %.i0179 = extractelement <4 x float> %83, i64 0
+ %.i0180 = fadd reassoc nnan nsz arcp contract afn float %.i0179, %.i0168
+ %.i1181 = extractelement <4 x float> %83, i64 1
+ %.i1182 = fadd reassoc nnan nsz arcp contract afn float %.i1181, %.i1170
+ %.i2183 = extractelement <4 x float> %83, i64 2
+ %.i2184 = fadd reassoc nnan nsz arcp contract afn float %.i2183, %.i2172
+ %.i3185 = extractelement <4 x float> %83, i64 3
+ %.i3186 = fadd reassoc nnan nsz arcp contract afn float %.i3185, %.i3174
+ %84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182)
+ %85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186)
+ %86 = bitcast <2 x half> %84 to float
+ %87 = bitcast <2 x half> %85 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %86, float %87, float poison, float poison, i1 true, i1 true)
+ ret void
+}
+
+declare noundef i64 @llvm.amdgcn.s.getpc() #3
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #5
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #4
+declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #3
+declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
+declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
+declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
+
+attributes #2 = { alwaysinline nounwind memory(readwrite) "InitialPSInputAddr"="2" "amdgpu-color-export"="1" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,-cumode" }
+attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
+attributes #8 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!0 = !{}
>From 2c5712792b9873bdd1f53ce766093499977300d3 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 5 Nov 2024 15:44:02 +0800
Subject: [PATCH 2/8] [AMDGPU] Add AMDGPU specific tryCandidate()
The AMDGPU-specific version differs from the generic one in two major ways:
1. It tries to cluster memory instructions more aggressively.
2. It tries to schedule long-latency loads earlier than short-latency
instructions.
I tested locally against about 470 real shaders and got the following perf changes
(only counting perf changes beyond +/-10%):
About 15 shaders improved by 10%~40%.
Only 3 shaders regressed by ~10%.
(This was tested together with another change which increases the maximum number of clustered dwords from 8 to 32.)
I will make another change to make that threshold configurable.
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 134 ++++++++++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3 +
.../AMDGPU/group-image-instructions.ll | 199 +++++++++---------
3 files changed, 238 insertions(+), 98 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..37802d335fb9fd 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -63,6 +63,10 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<bool> UseAMDGPUScheduleHeuristic(
+ "amdgpu-use-amdgpu-schedule-heuristic", cl::Hidden,
+ cl::desc("Use AMDGPU specific schedule heuristic "), cl::init(false));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -311,6 +315,136 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
+/// AMDGPU specific implementation, which is largely copy-pasted from the
+/// generic version, with some modifications to better hide memory latency.
+/// Major differences from the generic version:
+/// 1. Prioritize clustered operations before stall latency heuristic.
+/// 2. Prioritize long-latency-load before stall latency heuristic.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+/// if Cand is from a different zone than TryCand.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool GCNSchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Defer to the generic heuristic unless the AMDGPU-specific one was
+ // requested with -amdgpu-use-amdgpu-schedule-heuristic.
+ if (!UseAMDGPUScheduleHeuristic)
+ return GenericScheduler::tryCandidate(Cand, TryCand, Zone);
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Bias PhysReg Defs and copies to their uses and defined respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // AMDGPU-specific: We prioritize clustered instructions as we would get more
+ // benefit from clausing these memory instructions.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+ return TryCand.Reason != NoCand;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable, in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // AMDGPU-specific: Prioritize long latency memory load instructions in
+ // top-bottom order to hide more latency. A load counts as long latency when
+ // its latency exceeds 10x the other candidate's. The mayLoad check excludes
+ // store-like instructions, which we do not want to schedule too early.
+ bool TryMayLoad =
+ TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
+ bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
+
+ if (TryMayLoad || CandMayLoad) {
+ bool TryLongLatency =
+ TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
+ bool CandLongLatency =
+ 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
+
+ // When extending the bottom zone the two flags are swapped in the
+ // comparison, so the preference is mirrored relative to the top zone.
+ if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
+ Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
+ Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+
+ return false;
+}
+
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..addb05922cee66 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -41,6 +41,9 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+
SUnit *pickNodeBidirectional(bool &IsTopNode);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 25ceb296ddc5fe..8644cd3cc1ef85 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-use-amdgpu-schedule-heuristic=true -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
@@ -22,143 +22,146 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
; GFX11-NEXT: s_mov_b64 exec, s[18:19]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x10
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x20
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x30
-; GFX11-NEXT: v_interp_p10_f32 v4, v3, v0, v3 wait_exp:0
-; GFX11-NEXT: v_interp_p10_f32 v0, v2, v0, v2 wait_exp:7
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x40
-; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x50
-; GFX11-NEXT: v_interp_p2_f32 v42, v3, v1, v4 wait_exp:7
-; GFX11-NEXT: v_interp_p2_f32 v43, v2, v1, v0 wait_exp:7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v0, s18, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s19, v43
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
-; GFX11-NEXT: v_add_f32_e32 v8, s20, v42
-; GFX11-NEXT: v_add_f32_e32 v9, s21, v43
+; GFX11-NEXT: v_interp_p2_f32 v45, v2, v1, v4 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v44, v3, v1, v0 wait_exp:7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
+; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
+; GFX11-NEXT: v_add_f32_e32 v8, s20, v44
+; GFX11-NEXT: v_add_f32_e32 v9, s21, v45
+; GFX11-NEXT: v_add_f32_e32 v16, s24, v44
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v0, s22, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s23, v43
-; GFX11-NEXT: v_add_f32_e32 v20, s26, v42
-; GFX11-NEXT: v_add_f32_e32 v21, s27, v43
-; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v0, s24, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s25, v43
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
-; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
+; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
+; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
+; GFX11-NEXT: v_add_f32_e32 v17, s25, v45
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[16:19], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x50
+; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x60
+; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x70
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x80
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v0, s18, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s19, v43
-; GFX11-NEXT: image_sample v[24:27], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
+; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
+; GFX11-NEXT: v_add_f32_e32 v24, s20, v44
+; GFX11-NEXT: v_add_f32_e32 v25, s21, v45
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[20:23], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x90
-; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0xa0
+; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xa0
; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0xb0
; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0xc0
; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0xd0
; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0xe0
; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xf0
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
-; GFX11-NEXT: v_add_f32_e32 v0, s20, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s21, v43
-; GFX11-NEXT: v_add_f32_e32 v32, s22, v42
-; GFX11-NEXT: v_add_f32_e32 v33, s23, v43
+; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
+; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
+; GFX11-NEXT: v_add_f32_e32 v28, s24, v44
+; GFX11-NEXT: v_add_f32_e32 v29, s25, v45
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v36, s18, v42
-; GFX11-NEXT: v_add_f32_e32 v37, s19, v43
+; GFX11-NEXT: v_add_f32_e32 v30, s18, v44
+; GFX11-NEXT: v_add_f32_e32 v31, s19, v45
+; GFX11-NEXT: v_add_f32_e32 v32, s20, v44
+; GFX11-NEXT: v_add_f32_e32 v33, s21, v45
+; GFX11-NEXT: v_add_f32_e32 v34, s26, v44
+; GFX11-NEXT: v_add_f32_e32 v35, s27, v45
+; GFX11-NEXT: v_add_f32_e32 v36, s28, v44
+; GFX11-NEXT: v_add_f32_e32 v37, s29, v45
+; GFX11-NEXT: v_add_f32_e32 v38, s30, v44
+; GFX11-NEXT: v_add_f32_e32 v39, s31, v45
+; GFX11-NEXT: v_add_f32_e32 v40, s34, v44
+; GFX11-NEXT: v_add_f32_e32 v41, s35, v45
+; GFX11-NEXT: v_add_f32_e32 v42, s36, v44
+; GFX11-NEXT: v_add_f32_e32 v43, s37, v45
+; GFX11-NEXT: v_add_f32_e32 v44, s12, v44
+; GFX11-NEXT: v_add_f32_e32 v45, s13, v45
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_add_f32_e32 v46, v8, v4
+; GFX11-NEXT: v_add_f32_e32 v47, v9, v5
+; GFX11-NEXT: v_add_f32_e32 v48, v10, v6
+; GFX11-NEXT: v_add_f32_e32 v49, v11, v7
+; GFX11-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[28:31], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v0, s26, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s27, v43
-; GFX11-NEXT: v_add_f32_e32 v38, s24, v42
-; GFX11-NEXT: v_add_f32_e32 v39, s25, v43
-; GFX11-NEXT: v_add_f32_e32 v40, s28, v42
-; GFX11-NEXT: v_add_f32_e32 v41, s29, v43
+; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[8:11], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_add_f32_e32 v0, v12, v46
+; GFX11-NEXT: v_add_f32_e32 v1, v13, v47
+; GFX11-NEXT: v_add_f32_e32 v46, v14, v48
+; GFX11-NEXT: v_add_f32_e32 v47, v15, v49
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[12:15], v[30:31], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[28:31], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_add_f32_e32 v44, v8, v4
-; GFX11-NEXT: v_add_f32_e32 v45, v9, v5
-; GFX11-NEXT: v_add_f32_e32 v46, v10, v6
-; GFX11-NEXT: v_add_f32_e32 v47, v11, v7
+; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
+; GFX11-NEXT: v_add_f32_e32 v46, v18, v46
+; GFX11-NEXT: v_add_f32_e32 v47, v19, v47
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[4:7], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[8:11], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[16:19], v[34:35], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[32:35], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_f32_e32 v44, v12, v44
-; GFX11-NEXT: v_add_f32_e32 v45, v13, v45
-; GFX11-NEXT: v_add_f32_e32 v46, v14, v46
-; GFX11-NEXT: v_add_f32_e32 v47, v15, v47
-; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v36, s30, v42
-; GFX11-NEXT: v_add_f32_e32 v37, s31, v43
-; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_f32_e32 v0, v16, v44
-; GFX11-NEXT: v_add_f32_e32 v1, v17, v45
-; GFX11-NEXT: v_add_f32_e32 v44, v18, v46
-; GFX11-NEXT: v_add_f32_e32 v45, v19, v47
-; GFX11-NEXT: image_sample v[16:19], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_f32_e32 v46, v20, v0
-; GFX11-NEXT: v_add_f32_e32 v47, v21, v1
-; GFX11-NEXT: v_add_f32_e32 v44, v22, v44
-; GFX11-NEXT: v_add_f32_e32 v45, v23, v45
-; GFX11-NEXT: image_sample v[20:23], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v38, s34, v42
-; GFX11-NEXT: v_add_f32_e32 v39, s35, v43
-; GFX11-NEXT: v_add_f32_e32 v0, s36, v42
-; GFX11-NEXT: v_add_f32_e32 v1, s37, v43
-; GFX11-NEXT: v_add_f32_e32 v40, s12, v42
-; GFX11-NEXT: v_add_f32_e32 v41, s13, v43
-; GFX11-NEXT: s_and_b64 exec, exec, s[16:17]
-; GFX11-NEXT: image_sample v[36:39], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
+; GFX11-NEXT: v_add_f32_e32 v46, v22, v46
+; GFX11-NEXT: v_add_f32_e32 v47, v23, v47
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[20:23], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[36:39], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: v_add_f32_e32 v46, v24, v46
-; GFX11-NEXT: v_add_f32_e32 v47, v25, v47
-; GFX11-NEXT: v_add_f32_e32 v44, v26, v44
-; GFX11-NEXT: v_add_f32_e32 v45, v27, v45
+; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
+; GFX11-NEXT: v_add_f32_e32 v46, v26, v46
+; GFX11-NEXT: v_add_f32_e32 v47, v27, v47
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[24:27], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[24:27], v[42:43], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[40:43], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: v_add_f32_e32 v0, v28, v46
-; GFX11-NEXT: v_add_f32_e32 v1, v29, v47
-; GFX11-NEXT: v_add_f32_e32 v28, v30, v44
-; GFX11-NEXT: v_add_f32_e32 v29, v31, v45
-; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: v_add_f32_e32 v0, v32, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v33, v1
-; GFX11-NEXT: v_add_f32_e32 v28, v34, v28
-; GFX11-NEXT: v_add_f32_e32 v29, v35, v29
-; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_add_f32_e32 v0, v4, v0
; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v6, v28
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v29
-; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_f32_e32 v4, v6, v46
+; GFX11-NEXT: v_add_f32_e32 v5, v7, v47
+; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_add_f32_e32 v0, v8, v0
; GFX11-NEXT: v_add_f32_e32 v1, v9, v1
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: v_add_f32_e32 v5, v11, v5
-; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_add_f32_e32 v0, v12, v0
; GFX11-NEXT: v_add_f32_e32 v1, v13, v1
; GFX11-NEXT: v_add_f32_e32 v4, v14, v4
; GFX11-NEXT: v_add_f32_e32 v5, v15, v5
-; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_add_f32_e32 v0, v28, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v29, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v30, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v31, v5
+; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
; GFX11-NEXT: v_add_f32_e32 v4, v18, v4
; GFX11-NEXT: v_add_f32_e32 v5, v19, v5
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_add_f32_e32 v0, v32, v0
+; GFX11-NEXT: v_add_f32_e32 v1, v33, v1
+; GFX11-NEXT: v_add_f32_e32 v4, v34, v4
+; GFX11-NEXT: v_add_f32_e32 v5, v35, v5
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
>From 7c9124e1e1d247bafa602cf646a1e686c84783d3 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 25 Nov 2024 11:10:41 +0800
Subject: [PATCH 3/8] Move to new MaxMemoryClause schedule strategy
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 24 ++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 282 +++++++++---------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 26 +-
.../AMDGPU/group-image-instructions.ll | 2 +-
4 files changed, 194 insertions(+), 140 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 786baa6820e860..f121f524afe543 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -434,6 +434,12 @@ static cl::opt<bool> EnableMaxIlpSchedStrategy(
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> EnableMaxMemoryClauseSchedStrategy(
+ "amdgpu-enable-max-memory-clause-scheduling-strategy",
+ cl::desc("Enable scheduling strategy to maximize memory clause for a "
+ "single wave."),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
@@ -561,6 +567,18 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
return DAG;
}
+/// Create the machine scheduler registered as "gcn-max-memory-clause".
+/// Builds a GCNScheduleDAGMILive driven by GCNMaxMemoryClauseSchedStrategy and
+/// attaches load clustering, store clustering (only when the subtarget's
+/// shouldClusterStores() returns true), and export clustering DAG mutations.
+static ScheduleDAGInstrs *
+createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
+ ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+ C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ return DAG;
+}
+
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
@@ -601,6 +619,10 @@ static MachineSchedRegistry
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
createGCNMaxILPMachineScheduler);
+static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
+ "gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
+ createGCNMaxMemoryClauseMachineScheduler);
+
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
"gcn-iterative-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",
@@ -1289,6 +1311,8 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
if (EnableMaxIlpSchedStrategy)
return createGCNMaxILPMachineScheduler(C);
+ if (EnableMaxMemoryClauseSchedStrategy)
+ return createGCNMaxMemoryClauseMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 37802d335fb9fd..7d7bac50009eed 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -63,10 +63,6 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
-static cl::opt<bool> UseAMDGPUScheduleHeuristic(
- "amdgpu-use-amdgpu-schedule-heuristic", cl::Hidden,
- cl::desc("Use AMDGPU specific schedule heuristic "), cl::init(false));
-
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -315,136 +311,6 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
}
}
-/// AMDGPU specific implementation, which is largely copy-pasted from the
-/// generic version, with some modifications to better hide memory latency.
-// Major differences from the generic version:
-// 1. Prioritize clustered operations before stall latency heuristic.
-// 2. Prioritize long-latency-load before stall latency heuristic.
-///
-/// \param Cand provides the policy and current best candidate.
-/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
-/// \param Zone describes the scheduled zone that we are extending, or nullptr
-/// if Cand is from a different zone than TryCand.
-/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
-bool GCNSchedStrategy::tryCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary *Zone) const {
- if (!UseAMDGPUScheduleHeuristic)
- return GenericScheduler::tryCandidate(Cand, TryCand, Zone);
-
- // Initialize the candidate if needed.
- if (!Cand.isValid()) {
- TryCand.Reason = NodeOrder;
- return true;
- }
-
- // Bias PhysReg Defs and copies to their uses and defined respectively.
- if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
- biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
- return TryCand.Reason != NoCand;
-
- // Avoid exceeding the target's limit.
- if (DAG->isTrackingPressure() &&
- tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
- RegExcess, TRI, DAG->MF))
- return TryCand.Reason != NoCand;
-
- // Avoid increasing the max critical pressure in the scheduled region.
- if (DAG->isTrackingPressure() &&
- tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
- TryCand, Cand, RegCritical, TRI, DAG->MF))
- return TryCand.Reason != NoCand;
-
- // AMDGPU-specific: We prioritize clustered instructions as we would get more
- // benefit from clausing these memory instructions.
- const SUnit *CandNextClusterSU =
- Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
- const SUnit *TryCandNextClusterSU =
- TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
- if (tryGreater(TryCand.SU == TryCandNextClusterSU,
- Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
- return TryCand.Reason != NoCand;
-
- // We only compare a subset of features when comparing nodes between
- // Top and Bottom boundary. Some properties are simply incomparable, in many
- // other instances we should only override the other boundary if something
- // is a clear good pick on one boundary. Skip heuristics that are more
- // "tie-breaking" in nature.
- bool SameBoundary = Zone != nullptr;
- if (SameBoundary) {
- // For loops that are acyclic path limited, aggressively schedule for
- // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
- // heuristics to take precedence.
- if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
- tryLatency(TryCand, Cand, *Zone))
- return TryCand.Reason != NoCand;
-
- // AMDGPU-specific: Prioritize long latency memory load instructions in
- // top-bottom order to hide more latency. The mayLoad check is used
- // to exclude store-like instructions, which we do not want to scheduler
- // them too early.
- bool TryMayLoad =
- TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
- bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
-
- if (TryMayLoad || CandMayLoad) {
- bool TryLongLatency =
- TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
- bool CandLongLatency =
- 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
-
- if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
- Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
- Cand, Stall))
- return TryCand.Reason != NoCand;
- }
- // Prioritize instructions that read unbuffered resources by stall cycles.
- if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
- Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
- return TryCand.Reason != NoCand;
- }
-
- if (SameBoundary) {
- // Weak edges are for clustering and other constraints.
- if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
- getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
- return TryCand.Reason != NoCand;
- }
-
- // Avoid increasing the max pressure of the entire region.
- if (DAG->isTrackingPressure() &&
- tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
- Cand, RegMax, TRI, DAG->MF))
- return TryCand.Reason != NoCand;
-
- if (SameBoundary) {
- // Avoid critical resource consumption and balance the schedule.
- TryCand.initResourceDelta(DAG, SchedModel);
- if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
- TryCand, Cand, ResourceReduce))
- return TryCand.Reason != NoCand;
- if (tryGreater(TryCand.ResDelta.DemandedResources,
- Cand.ResDelta.DemandedResources, TryCand, Cand,
- ResourceDemand))
- return TryCand.Reason != NoCand;
-
- // Avoid serializing long latency dependence chains.
- // For acyclic path limited loops, latency was already checked above.
- if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
- !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
- return TryCand.Reason != NoCand;
-
- // Fall through to original instruction order.
- if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
- (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
- TryCand.Reason = NodeOrder;
- return true;
- }
- }
-
- return false;
-}
-
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
@@ -749,6 +615,137 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}
+GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
+ const MachineSchedContext *C)
+ : GCNSchedStrategy(C) {
+ SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
+}
+/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
+/// instructions as much as possible. This is achieved by:
+/// 1. Prioritizing clustered operations before the stall latency heuristic.
+/// 2. Prioritizing long-latency loads before the stall latency heuristic.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+/// if Cand is from a different zone than TryCand.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Bias PhysReg defs and copies toward their uses and defs, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // MaxMemoryClause-specific: We prioritize clustered instructions as we would
+ // get more benefit from clausing these memory instructions.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+ return TryCand.Reason != NoCand;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable, in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // MaxMemoryClause-specific: Prioritize long latency memory load
+ // instructions in top-bottom order to hide more latency. The mayLoad check
+ // is used to exclude store-like instructions, which we do not want to
+ // schedule too early.
+ bool TryMayLoad =
+ TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
+ bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
+
+ if (TryMayLoad || CandMayLoad) {
+ bool TryLongLatency =
+ TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
+ bool CandLongLatency =
+ 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
+
+ if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
+ Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
+ Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+
+ return false;
+}
+
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
@@ -778,6 +775,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
return std::make_unique<PreRARematStage>(SchedStageID, *this);
case GCNSchedStageID::ILPInitialSchedule:
return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+ case GCNSchedStageID::MemoryClauseInitialSchedule:
+ return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
+ *this);
}
llvm_unreachable("Unknown SchedStageID.");
@@ -1003,6 +1003,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
case GCNSchedStageID::ILPInitialSchedule:
OS << "Max ILP Initial Schedule";
break;
+ case GCNSchedStageID::MemoryClauseInitialSchedule:
+ OS << "Max memory clause Initial Schedule";
+ break;
}
return OS;
@@ -1222,7 +1225,8 @@ void GCNSchedStage::setupNewBlock() {
// Get real RP for the region if it hasn't be calculated before. After the
// initial schedule stage real RP will be collected after scheduling.
if (StageID == GCNSchedStageID::OccInitialSchedule ||
- StageID == GCNSchedStageID::ILPInitialSchedule)
+ StageID == GCNSchedStageID::ILPInitialSchedule ||
+ StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
@@ -1523,6 +1527,14 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
return false;
}
+bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
+ unsigned WavesAfter) {
+ if (mayCauseSpilling(WavesAfter))
+ return true;
+
+ return false;
+}
+
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
!PressureAfter.less(MF, PressureBefore)) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index addb05922cee66..44db834a41f828 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -29,7 +29,8 @@ enum class GCNSchedStageID : unsigned {
UnclusteredHighRPReschedule = 1,
ClusteredLowOccupancyReschedule = 2,
PreRARematerialize = 3,
- ILPInitialSchedule = 4
+ ILPInitialSchedule = 4,
+ MemoryClauseInitialSchedule = 5
};
#ifndef NDEBUG
@@ -41,9 +42,6 @@ raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
/// heuristics to determine excess/critical pressure sets.
class GCNSchedStrategy : public GenericScheduler {
protected:
- bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone) const override;
-
SUnit *pickNodeBidirectional(bool &IsTopNode);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -152,6 +150,17 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
GCNMaxILPSchedStrategy(const MachineSchedContext *C);
};
+/// The goal of this scheduling strategy is to maximize memory clause for a
+/// single wave.
+class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+
+public:
+ GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
+};
+
class ScheduleMetrics {
unsigned ScheduleLength;
unsigned BubbleCycles;
@@ -466,6 +475,15 @@ class ILPInitialScheduleStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};
+class MemoryClauseInitialScheduleStage : public GCNSchedStage {
+public:
+ bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+ MemoryClauseInitialScheduleStage(GCNSchedStageID StageID,
+ GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 8644cd3cc1ef85..011e54b10efc3f 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-use-amdgpu-schedule-heuristic=true -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-max-memory-clause-scheduling-strategy -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
>From 3bcfc79dfc7b24485c01abe1b3c1fb41d2c7310f Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 25 Nov 2024 11:37:52 +0800
Subject: [PATCH 4/8] Use named IR values
---
.../AMDGPU/group-image-instructions.ll | 434 +++++++++---------
1 file changed, 217 insertions(+), 217 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 011e54b10efc3f..5ec4a6ed848156 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -188,285 +188,285 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: exp mrt0 v0, v1, off, off done
; GFX11-NEXT: s_endpgm
.entry:
- %0 = call i64 @llvm.amdgcn.s.getpc()
- %1 = and i64 %0, -4294967296
- %2 = zext i32 %userdata6 to i64
- %3 = or disjoint i64 %1, %2
- %4 = inttoptr i64 %3 to ptr addrspace(4)
- %5 = load <4 x i32>, ptr addrspace(4) %4, align 16
- %6 = zext i32 %userdata7 to i64
- %7 = or disjoint i64 %1, %6
- %8 = inttoptr i64 %7 to ptr addrspace(4)
- %9 = load <4 x i32>, ptr addrspace(4) %8, align 4, !invariant.load !0
- %10 = zext i32 %userdata8 to i64
- %11 = or disjoint i64 %1, %10
- %12 = inttoptr i64 %11 to ptr addrspace(4)
- %13 = load <8 x i32>, ptr addrspace(4) %12, align 4, !invariant.load !0
- %14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask)
+ %i = call i64 @llvm.amdgcn.s.getpc()
+ %i1 = and i64 %i, -4294967296
+ %i2 = zext i32 %userdata6 to i64
+ %i3 = or disjoint i64 %i1, %i2
+ %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+ %i5 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+ %i6 = zext i32 %userdata7 to i64
+ %i7 = or disjoint i64 %i1, %i6
+ %i8 = inttoptr i64 %i7 to ptr addrspace(4)
+ %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 4, !invariant.load !0
+ %i10 = zext i32 %userdata8 to i64
+ %i11 = or disjoint i64 %i1, %i10
+ %i12 = inttoptr i64 %i11 to ptr addrspace(4)
+ %i13 = load <8 x i32>, ptr addrspace(4) %i12, align 4, !invariant.load !0
+ %i14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask)
%PerspInterpCenter.i1 = extractelement <2 x float> %PerspInterpCenter, i64 1
%PerspInterpCenter.i0 = extractelement <2 x float> %PerspInterpCenter, i64 0
- %15 = call float @llvm.amdgcn.interp.inreg.p10(float %14, float %PerspInterpCenter.i0, float %14)
- %16 = call float @llvm.amdgcn.interp.inreg.p2(float %14, float %PerspInterpCenter.i1, float %15)
- %17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask)
- %18 = call float @llvm.amdgcn.interp.inreg.p10(float %17, float %PerspInterpCenter.i0, float %17)
- %19 = call float @llvm.amdgcn.interp.inreg.p2(float %17, float %PerspInterpCenter.i1, float %18)
- %20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 16, i32 0), !invariant.load !0
- %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %22 = bitcast <4 x i32> %21 to <4 x float>
- %.i0 = extractelement <4 x float> %22, i64 0
- %.i1 = extractelement <4 x float> %22, i64 1
- %.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %19
- %.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %16
- %23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i03, float %.i14, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i010 = extractelement <4 x float> %23, i64 0
- %.i113 = extractelement <4 x float> %23, i64 1
- %.i215 = extractelement <4 x float> %23, i64 2
- %.i317 = extractelement <4 x float> %23, i64 3
- %24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 32, i32 0), !invariant.load !0
- %25 = shufflevector <2 x i32> %24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %26 = bitcast <4 x i32> %25 to <4 x float>
- %.i05 = extractelement <4 x float> %26, i64 0
- %.i16 = extractelement <4 x float> %26, i64 1
- %.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %19
- %.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %16
- %27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i07, float %.i18, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i09 = extractelement <4 x float> %27, i64 0
+ %i15 = call float @llvm.amdgcn.interp.inreg.p10(float %i14, float %PerspInterpCenter.i0, float %i14)
+ %i16 = call float @llvm.amdgcn.interp.inreg.p2(float %i14, float %PerspInterpCenter.i1, float %i15)
+ %i17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask)
+ %i18 = call float @llvm.amdgcn.interp.inreg.p10(float %i17, float %PerspInterpCenter.i0, float %i17)
+ %i19 = call float @llvm.amdgcn.interp.inreg.p2(float %i17, float %PerspInterpCenter.i1, float %i18)
+ %i20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 16, i32 0), !invariant.load !0
+ %i21 = shufflevector <2 x i32> %i20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i22 = bitcast <4 x i32> %i21 to <4 x float>
+ %.i0 = extractelement <4 x float> %i22, i64 0
+ %.i1 = extractelement <4 x float> %i22, i64 1
+ %.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %i19
+ %.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %i16
+ %i23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i03, float %.i14, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i010 = extractelement <4 x float> %i23, i64 0
+ %.i113 = extractelement <4 x float> %i23, i64 1
+ %.i215 = extractelement <4 x float> %i23, i64 2
+ %.i317 = extractelement <4 x float> %i23, i64 3
+ %i24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 32, i32 0), !invariant.load !0
+ %i25 = shufflevector <2 x i32> %i24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i26 = bitcast <4 x i32> %i25 to <4 x float>
+ %.i05 = extractelement <4 x float> %i26, i64 0
+ %.i16 = extractelement <4 x float> %i26, i64 1
+ %.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %i19
+ %.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %i16
+ %i27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i07, float %.i18, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i09 = extractelement <4 x float> %i27, i64 0
%.i011 = fadd reassoc nnan nsz arcp contract afn float %.i09, %.i010
- %.i112 = extractelement <4 x float> %27, i64 1
+ %.i112 = extractelement <4 x float> %i27, i64 1
%.i114 = fadd reassoc nnan nsz arcp contract afn float %.i112, %.i113
- %.i2 = extractelement <4 x float> %27, i64 2
+ %.i2 = extractelement <4 x float> %i27, i64 2
%.i216 = fadd reassoc nnan nsz arcp contract afn float %.i2, %.i215
- %.i3 = extractelement <4 x float> %27, i64 3
+ %.i3 = extractelement <4 x float> %i27, i64 3
%.i318 = fadd reassoc nnan nsz arcp contract afn float %.i3, %.i317
- %28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 48, i32 0), !invariant.load !0
- %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %30 = bitcast <4 x i32> %29 to <4 x float>
- %.i019 = extractelement <4 x float> %30, i64 0
- %.i120 = extractelement <4 x float> %30, i64 1
- %.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %19
- %.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %16
- %31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i021, float %.i122, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i023 = extractelement <4 x float> %31, i64 0
+ %i28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 48, i32 0), !invariant.load !0
+ %i29 = shufflevector <2 x i32> %i28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i30 = bitcast <4 x i32> %i29 to <4 x float>
+ %.i019 = extractelement <4 x float> %i30, i64 0
+ %.i120 = extractelement <4 x float> %i30, i64 1
+ %.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %i19
+ %.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %i16
+ %i31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i021, float %.i122, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i023 = extractelement <4 x float> %i31, i64 0
%.i024 = fadd reassoc nnan nsz arcp contract afn float %.i023, %.i011
- %.i125 = extractelement <4 x float> %31, i64 1
+ %.i125 = extractelement <4 x float> %i31, i64 1
%.i126 = fadd reassoc nnan nsz arcp contract afn float %.i125, %.i114
- %.i227 = extractelement <4 x float> %31, i64 2
+ %.i227 = extractelement <4 x float> %i31, i64 2
%.i228 = fadd reassoc nnan nsz arcp contract afn float %.i227, %.i216
- %.i329 = extractelement <4 x float> %31, i64 3
+ %.i329 = extractelement <4 x float> %i31, i64 3
%.i330 = fadd reassoc nnan nsz arcp contract afn float %.i329, %.i318
- %32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 64, i32 0), !invariant.load !0
- %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %34 = bitcast <4 x i32> %33 to <4 x float>
- %.i031 = extractelement <4 x float> %34, i64 0
- %.i132 = extractelement <4 x float> %34, i64 1
- %.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %19
- %.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %16
- %35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i033, float %.i134, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i035 = extractelement <4 x float> %35, i64 0
+ %i32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 64, i32 0), !invariant.load !0
+ %i33 = shufflevector <2 x i32> %i32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i34 = bitcast <4 x i32> %i33 to <4 x float>
+ %.i031 = extractelement <4 x float> %i34, i64 0
+ %.i132 = extractelement <4 x float> %i34, i64 1
+ %.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %i19
+ %.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %i16
+ %i35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i033, float %.i134, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i035 = extractelement <4 x float> %i35, i64 0
%.i036 = fadd reassoc nnan nsz arcp contract afn float %.i035, %.i024
- %.i137 = extractelement <4 x float> %35, i64 1
+ %.i137 = extractelement <4 x float> %i35, i64 1
%.i138 = fadd reassoc nnan nsz arcp contract afn float %.i137, %.i126
- %.i239 = extractelement <4 x float> %35, i64 2
+ %.i239 = extractelement <4 x float> %i35, i64 2
%.i240 = fadd reassoc nnan nsz arcp contract afn float %.i239, %.i228
- %.i341 = extractelement <4 x float> %35, i64 3
+ %.i341 = extractelement <4 x float> %i35, i64 3
%.i342 = fadd reassoc nnan nsz arcp contract afn float %.i341, %.i330
- %36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 80, i32 0), !invariant.load !0
- %37 = shufflevector <2 x i32> %36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %38 = bitcast <4 x i32> %37 to <4 x float>
- %.i043 = extractelement <4 x float> %38, i64 0
- %.i144 = extractelement <4 x float> %38, i64 1
- %.i045 = fadd reassoc nnan nsz arcp contract afn float %.i043, %19
- %.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %16
- %39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i045, float %.i146, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i047 = extractelement <4 x float> %39, i64 0
+ %i36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 80, i32 0), !invariant.load !0
+ %i37 = shufflevector <2 x i32> %i36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i38 = bitcast <4 x i32> %i37 to <4 x float>
+ %.i043 = extractelement <4 x float> %i38, i64 0
+ %.i144 = extractelement <4 x float> %i38, i64 1
+ %.i045 = fadd reassoc nnan nsz arcp contract afn float %.i043, %i19
+ %.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %i16
+ %i39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i045, float %.i146, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i047 = extractelement <4 x float> %i39, i64 0
%.i048 = fadd reassoc nnan nsz arcp contract afn float %.i047, %.i036
- %.i149 = extractelement <4 x float> %39, i64 1
+ %.i149 = extractelement <4 x float> %i39, i64 1
%.i150 = fadd reassoc nnan nsz arcp contract afn float %.i149, %.i138
- %.i251 = extractelement <4 x float> %39, i64 2
+ %.i251 = extractelement <4 x float> %i39, i64 2
%.i252 = fadd reassoc nnan nsz arcp contract afn float %.i251, %.i240
- %.i353 = extractelement <4 x float> %39, i64 3
+ %.i353 = extractelement <4 x float> %i39, i64 3
%.i354 = fadd reassoc nnan nsz arcp contract afn float %.i353, %.i342
- %40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 96, i32 0), !invariant.load !0
- %41 = shufflevector <2 x i32> %40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %42 = bitcast <4 x i32> %41 to <4 x float>
- %.i055 = extractelement <4 x float> %42, i64 0
- %.i156 = extractelement <4 x float> %42, i64 1
- %.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %19
- %.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %16
- %43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i057, float %.i158, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i059 = extractelement <4 x float> %43, i64 0
+ %i40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 96, i32 0), !invariant.load !0
+ %i41 = shufflevector <2 x i32> %i40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i42 = bitcast <4 x i32> %i41 to <4 x float>
+ %.i055 = extractelement <4 x float> %i42, i64 0
+ %.i156 = extractelement <4 x float> %i42, i64 1
+ %.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %i19
+ %.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %i16
+ %i43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i057, float %.i158, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i059 = extractelement <4 x float> %i43, i64 0
%.i060 = fadd reassoc nnan nsz arcp contract afn float %.i059, %.i048
- %.i161 = extractelement <4 x float> %43, i64 1
+ %.i161 = extractelement <4 x float> %i43, i64 1
%.i162 = fadd reassoc nnan nsz arcp contract afn float %.i161, %.i150
- %.i263 = extractelement <4 x float> %43, i64 2
+ %.i263 = extractelement <4 x float> %i43, i64 2
%.i264 = fadd reassoc nnan nsz arcp contract afn float %.i263, %.i252
- %.i365 = extractelement <4 x float> %43, i64 3
+ %.i365 = extractelement <4 x float> %i43, i64 3
%.i366 = fadd reassoc nnan nsz arcp contract afn float %.i365, %.i354
- %44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 112, i32 0), !invariant.load !0
- %45 = shufflevector <2 x i32> %44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %46 = bitcast <4 x i32> %45 to <4 x float>
- %.i067 = extractelement <4 x float> %46, i64 0
- %.i168 = extractelement <4 x float> %46, i64 1
- %.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %19
- %.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %16
- %47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i069, float %.i170, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i071 = extractelement <4 x float> %47, i64 0
+ %i44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 112, i32 0), !invariant.load !0
+ %i45 = shufflevector <2 x i32> %i44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i46 = bitcast <4 x i32> %i45 to <4 x float>
+ %.i067 = extractelement <4 x float> %i46, i64 0
+ %.i168 = extractelement <4 x float> %i46, i64 1
+ %.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %i19
+ %.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %i16
+ %i47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i069, float %.i170, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i071 = extractelement <4 x float> %i47, i64 0
%.i072 = fadd reassoc nnan nsz arcp contract afn float %.i071, %.i060
- %.i173 = extractelement <4 x float> %47, i64 1
+ %.i173 = extractelement <4 x float> %i47, i64 1
%.i174 = fadd reassoc nnan nsz arcp contract afn float %.i173, %.i162
- %.i275 = extractelement <4 x float> %47, i64 2
+ %.i275 = extractelement <4 x float> %i47, i64 2
%.i276 = fadd reassoc nnan nsz arcp contract afn float %.i275, %.i264
- %.i377 = extractelement <4 x float> %47, i64 3
+ %.i377 = extractelement <4 x float> %i47, i64 3
%.i378 = fadd reassoc nnan nsz arcp contract afn float %.i377, %.i366
- %48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 128, i32 0), !invariant.load !0
- %49 = shufflevector <2 x i32> %48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %50 = bitcast <4 x i32> %49 to <4 x float>
- %.i079 = extractelement <4 x float> %50, i64 0
- %.i180 = extractelement <4 x float> %50, i64 1
- %.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %19
- %.i182 = fadd reassoc nnan nsz arcp contract afn float %.i180, %16
- %51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i081, float %.i182, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i083 = extractelement <4 x float> %51, i64 0
+ %i48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 128, i32 0), !invariant.load !0
+ %i49 = shufflevector <2 x i32> %i48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i50 = bitcast <4 x i32> %i49 to <4 x float>
+ %.i079 = extractelement <4 x float> %i50, i64 0
+ %.i180 = extractelement <4 x float> %i50, i64 1
+ %.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %i19
+ %.i182 = fadd reassoc nnan nsz arcp contract afn float %.i180, %i16
+ %i51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i081, float %.i182, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i083 = extractelement <4 x float> %i51, i64 0
%.i084 = fadd reassoc nnan nsz arcp contract afn float %.i083, %.i072
- %.i185 = extractelement <4 x float> %51, i64 1
+ %.i185 = extractelement <4 x float> %i51, i64 1
%.i186 = fadd reassoc nnan nsz arcp contract afn float %.i185, %.i174
- %.i287 = extractelement <4 x float> %51, i64 2
+ %.i287 = extractelement <4 x float> %i51, i64 2
%.i288 = fadd reassoc nnan nsz arcp contract afn float %.i287, %.i276
- %.i389 = extractelement <4 x float> %51, i64 3
+ %.i389 = extractelement <4 x float> %i51, i64 3
%.i390 = fadd reassoc nnan nsz arcp contract afn float %.i389, %.i378
- %52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 144, i32 0), !invariant.load !0
- %53 = shufflevector <2 x i32> %52, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %54 = bitcast <4 x i32> %53 to <4 x float>
- %.i091 = extractelement <4 x float> %54, i64 0
- %.i192 = extractelement <4 x float> %54, i64 1
- %.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %19
- %.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %16
- %55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i093, float %.i194, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i095 = extractelement <4 x float> %55, i64 0
+ %i52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 144, i32 0), !invariant.load !0
+ %i53 = shufflevector <2 x i32> %i52, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i54 = bitcast <4 x i32> %i53 to <4 x float>
+ %.i091 = extractelement <4 x float> %i54, i64 0
+ %.i192 = extractelement <4 x float> %i54, i64 1
+ %.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %i19
+ %.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %i16
+ %i55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i093, float %.i194, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i095 = extractelement <4 x float> %i55, i64 0
%.i096 = fadd reassoc nnan nsz arcp contract afn float %.i095, %.i084
- %.i197 = extractelement <4 x float> %55, i64 1
+ %.i197 = extractelement <4 x float> %i55, i64 1
%.i198 = fadd reassoc nnan nsz arcp contract afn float %.i197, %.i186
- %.i299 = extractelement <4 x float> %55, i64 2
+ %.i299 = extractelement <4 x float> %i55, i64 2
%.i2100 = fadd reassoc nnan nsz arcp contract afn float %.i299, %.i288
- %.i3101 = extractelement <4 x float> %55, i64 3
+ %.i3101 = extractelement <4 x float> %i55, i64 3
%.i3102 = fadd reassoc nnan nsz arcp contract afn float %.i3101, %.i390
- %56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 160, i32 0), !invariant.load !0
- %57 = shufflevector <2 x i32> %56, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %58 = bitcast <4 x i32> %57 to <4 x float>
- %.i0103 = extractelement <4 x float> %58, i64 0
- %.i1104 = extractelement <4 x float> %58, i64 1
- %.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %19
- %.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %16
- %59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0105, float %.i1106, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0107 = extractelement <4 x float> %59, i64 0
+ %i56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 160, i32 0), !invariant.load !0
+ %i57 = shufflevector <2 x i32> %i56, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i58 = bitcast <4 x i32> %i57 to <4 x float>
+ %.i0103 = extractelement <4 x float> %i58, i64 0
+ %.i1104 = extractelement <4 x float> %i58, i64 1
+ %.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %i19
+ %.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %i16
+ %i59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0105, float %.i1106, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0107 = extractelement <4 x float> %i59, i64 0
%.i0108 = fadd reassoc nnan nsz arcp contract afn float %.i0107, %.i096
- %.i1109 = extractelement <4 x float> %59, i64 1
+ %.i1109 = extractelement <4 x float> %i59, i64 1
%.i1110 = fadd reassoc nnan nsz arcp contract afn float %.i1109, %.i198
- %.i2111 = extractelement <4 x float> %59, i64 2
+ %.i2111 = extractelement <4 x float> %i59, i64 2
%.i2112 = fadd reassoc nnan nsz arcp contract afn float %.i2111, %.i2100
- %.i3113 = extractelement <4 x float> %59, i64 3
+ %.i3113 = extractelement <4 x float> %i59, i64 3
%.i3114 = fadd reassoc nnan nsz arcp contract afn float %.i3113, %.i3102
- %60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 176, i32 0), !invariant.load !0
- %61 = shufflevector <2 x i32> %60, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %62 = bitcast <4 x i32> %61 to <4 x float>
- %.i0115 = extractelement <4 x float> %62, i64 0
- %.i1116 = extractelement <4 x float> %62, i64 1
- %.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %19
- %.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %16
- %63 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0117, float %.i1118, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0119 = extractelement <4 x float> %63, i64 0
+ %i60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 176, i32 0), !invariant.load !0
+ %i61 = shufflevector <2 x i32> %i60, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i62 = bitcast <4 x i32> %i61 to <4 x float>
+ %.i0115 = extractelement <4 x float> %i62, i64 0
+ %.i1116 = extractelement <4 x float> %i62, i64 1
+ %.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %i19
+ %.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %i16
+ %i63 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0117, float %.i1118, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0119 = extractelement <4 x float> %i63, i64 0
%.i0120 = fadd reassoc nnan nsz arcp contract afn float %.i0119, %.i0108
- %.i1121 = extractelement <4 x float> %63, i64 1
+ %.i1121 = extractelement <4 x float> %i63, i64 1
%.i1122 = fadd reassoc nnan nsz arcp contract afn float %.i1121, %.i1110
- %.i2123 = extractelement <4 x float> %63, i64 2
+ %.i2123 = extractelement <4 x float> %i63, i64 2
%.i2124 = fadd reassoc nnan nsz arcp contract afn float %.i2123, %.i2112
- %.i3125 = extractelement <4 x float> %63, i64 3
+ %.i3125 = extractelement <4 x float> %i63, i64 3
%.i3126 = fadd reassoc nnan nsz arcp contract afn float %.i3125, %.i3114
- %64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 192, i32 0), !invariant.load !0
- %65 = shufflevector <2 x i32> %64, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %66 = bitcast <4 x i32> %65 to <4 x float>
- %.i0127 = extractelement <4 x float> %66, i64 0
- %.i1128 = extractelement <4 x float> %66, i64 1
- %.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %19
- %.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %16
- %67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0129, float %.i1130, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0131 = extractelement <4 x float> %67, i64 0
+ %i64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 192, i32 0), !invariant.load !0
+ %i65 = shufflevector <2 x i32> %i64, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i66 = bitcast <4 x i32> %i65 to <4 x float>
+ %.i0127 = extractelement <4 x float> %i66, i64 0
+ %.i1128 = extractelement <4 x float> %i66, i64 1
+ %.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %i19
+ %.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %i16
+ %i67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0129, float %.i1130, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0131 = extractelement <4 x float> %i67, i64 0
%.i0132 = fadd reassoc nnan nsz arcp contract afn float %.i0131, %.i0120
- %.i1133 = extractelement <4 x float> %67, i64 1
+ %.i1133 = extractelement <4 x float> %i67, i64 1
%.i1134 = fadd reassoc nnan nsz arcp contract afn float %.i1133, %.i1122
- %.i2135 = extractelement <4 x float> %67, i64 2
+ %.i2135 = extractelement <4 x float> %i67, i64 2
%.i2136 = fadd reassoc nnan nsz arcp contract afn float %.i2135, %.i2124
- %.i3137 = extractelement <4 x float> %67, i64 3
+ %.i3137 = extractelement <4 x float> %i67, i64 3
%.i3138 = fadd reassoc nnan nsz arcp contract afn float %.i3137, %.i3126
- %68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 208, i32 0), !invariant.load !0
- %69 = shufflevector <2 x i32> %68, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %70 = bitcast <4 x i32> %69 to <4 x float>
- %.i0139 = extractelement <4 x float> %70, i64 0
- %.i1140 = extractelement <4 x float> %70, i64 1
- %.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %19
- %.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %16
- %71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0141, float %.i1142, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0143 = extractelement <4 x float> %71, i64 0
+ %i68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 208, i32 0), !invariant.load !0
+ %i69 = shufflevector <2 x i32> %i68, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i70 = bitcast <4 x i32> %i69 to <4 x float>
+ %.i0139 = extractelement <4 x float> %i70, i64 0
+ %.i1140 = extractelement <4 x float> %i70, i64 1
+ %.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %i19
+ %.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %i16
+ %i71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0141, float %.i1142, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0143 = extractelement <4 x float> %i71, i64 0
%.i0144 = fadd reassoc nnan nsz arcp contract afn float %.i0143, %.i0132
- %.i1145 = extractelement <4 x float> %71, i64 1
+ %.i1145 = extractelement <4 x float> %i71, i64 1
%.i1146 = fadd reassoc nnan nsz arcp contract afn float %.i1145, %.i1134
- %.i2147 = extractelement <4 x float> %71, i64 2
+ %.i2147 = extractelement <4 x float> %i71, i64 2
%.i2148 = fadd reassoc nnan nsz arcp contract afn float %.i2147, %.i2136
- %.i3149 = extractelement <4 x float> %71, i64 3
+ %.i3149 = extractelement <4 x float> %i71, i64 3
%.i3150 = fadd reassoc nnan nsz arcp contract afn float %.i3149, %.i3138
- %72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 224, i32 0), !invariant.load !0
- %73 = shufflevector <2 x i32> %72, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %74 = bitcast <4 x i32> %73 to <4 x float>
- %.i0151 = extractelement <4 x float> %74, i64 0
- %.i1152 = extractelement <4 x float> %74, i64 1
- %.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %19
- %.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %16
- %75 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0153, float %.i1154, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0155 = extractelement <4 x float> %75, i64 0
+ %i72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 224, i32 0), !invariant.load !0
+ %i73 = shufflevector <2 x i32> %i72, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i74 = bitcast <4 x i32> %i73 to <4 x float>
+ %.i0151 = extractelement <4 x float> %i74, i64 0
+ %.i1152 = extractelement <4 x float> %i74, i64 1
+ %.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %i19
+ %.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %i16
+ %i75 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0153, float %.i1154, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0155 = extractelement <4 x float> %i75, i64 0
%.i0156 = fadd reassoc nnan nsz arcp contract afn float %.i0155, %.i0144
- %.i1157 = extractelement <4 x float> %75, i64 1
+ %.i1157 = extractelement <4 x float> %i75, i64 1
%.i1158 = fadd reassoc nnan nsz arcp contract afn float %.i1157, %.i1146
- %.i2159 = extractelement <4 x float> %75, i64 2
+ %.i2159 = extractelement <4 x float> %i75, i64 2
%.i2160 = fadd reassoc nnan nsz arcp contract afn float %.i2159, %.i2148
- %.i3161 = extractelement <4 x float> %75, i64 3
+ %.i3161 = extractelement <4 x float> %i75, i64 3
%.i3162 = fadd reassoc nnan nsz arcp contract afn float %.i3161, %.i3150
- %76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 240, i32 0), !invariant.load !0
- %77 = shufflevector <2 x i32> %76, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %78 = bitcast <4 x i32> %77 to <4 x float>
- %.i0163 = extractelement <4 x float> %78, i64 0
- %.i1164 = extractelement <4 x float> %78, i64 1
- %.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %19
- %.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %16
- %79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0165, float %.i1166, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0167 = extractelement <4 x float> %79, i64 0
+ %i76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 240, i32 0), !invariant.load !0
+ %i77 = shufflevector <2 x i32> %i76, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i78 = bitcast <4 x i32> %i77 to <4 x float>
+ %.i0163 = extractelement <4 x float> %i78, i64 0
+ %.i1164 = extractelement <4 x float> %i78, i64 1
+ %.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %i19
+ %.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %i16
+ %i79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0165, float %.i1166, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0167 = extractelement <4 x float> %i79, i64 0
%.i0168 = fadd reassoc nnan nsz arcp contract afn float %.i0167, %.i0156
- %.i1169 = extractelement <4 x float> %79, i64 1
+ %.i1169 = extractelement <4 x float> %i79, i64 1
%.i1170 = fadd reassoc nnan nsz arcp contract afn float %.i1169, %.i1158
- %.i2171 = extractelement <4 x float> %79, i64 2
+ %.i2171 = extractelement <4 x float> %i79, i64 2
%.i2172 = fadd reassoc nnan nsz arcp contract afn float %.i2171, %.i2160
- %.i3173 = extractelement <4 x float> %79, i64 3
+ %.i3173 = extractelement <4 x float> %i79, i64 3
%.i3174 = fadd reassoc nnan nsz arcp contract afn float %.i3173, %.i3162
- %80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %5, i32 256, i32 0), !invariant.load !0
- %81 = shufflevector <2 x i32> %80, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
- %82 = bitcast <4 x i32> %81 to <4 x float>
- %.i0175 = extractelement <4 x float> %82, i64 0
- %.i1176 = extractelement <4 x float> %82, i64 1
- %.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %19
- %.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %16
- %83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %.i0177, float %.i1178, <8 x i32> %13, <4 x i32> %9, i1 false, i32 0, i32 0)
- %.i0179 = extractelement <4 x float> %83, i64 0
+ %i80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 256, i32 0), !invariant.load !0
+ %i81 = shufflevector <2 x i32> %i80, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i82 = bitcast <4 x i32> %i81 to <4 x float>
+ %.i0175 = extractelement <4 x float> %i82, i64 0
+ %.i1176 = extractelement <4 x float> %i82, i64 1
+ %.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %i19
+ %.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %i16
+ %i83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0177, float %.i1178, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
+ %.i0179 = extractelement <4 x float> %i83, i64 0
%.i0180 = fadd reassoc nnan nsz arcp contract afn float %.i0179, %.i0168
- %.i1181 = extractelement <4 x float> %83, i64 1
+ %.i1181 = extractelement <4 x float> %i83, i64 1
%.i1182 = fadd reassoc nnan nsz arcp contract afn float %.i1181, %.i1170
- %.i2183 = extractelement <4 x float> %83, i64 2
+ %.i2183 = extractelement <4 x float> %i83, i64 2
%.i2184 = fadd reassoc nnan nsz arcp contract afn float %.i2183, %.i2172
- %.i3185 = extractelement <4 x float> %83, i64 3
+ %.i3185 = extractelement <4 x float> %i83, i64 3
%.i3186 = fadd reassoc nnan nsz arcp contract afn float %.i3185, %.i3174
- %84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182)
- %85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186)
- %86 = bitcast <2 x half> %84 to float
- %87 = bitcast <2 x half> %85 to float
- call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %86, float %87, float poison, float poison, i1 true, i1 true)
+ %i84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182)
+ %i85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186)
+ %i86 = bitcast <2 x half> %i84 to float
+ %i87 = bitcast <2 x half> %i85 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %i86, float %i87, float poison, float poison, i1 true, i1 true)
ret void
}
>From 5439ff5b957a8dea741010ceff194238e57a987a Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 26 Nov 2024 16:11:14 +0800
Subject: [PATCH 5/8] Also add a function attribute to control it
The graphics frontend cannot set a command-line option per function.
Add a function attribute so that the frontend can enable the strategy
for individual functions.
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f121f524afe543..0c46c66dd38983 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1311,7 +1311,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
if (EnableMaxIlpSchedStrategy)
return createGCNMaxILPMachineScheduler(C);
- if (EnableMaxMemoryClauseSchedStrategy)
+
+ if (EnableMaxMemoryClauseSchedStrategy ||
+ C->MF->getFunction().hasFnAttribute(
+ "amdgpu-enable-max-memory-clause-scheduling-strategy"))
return createGCNMaxMemoryClauseMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
>From 73d4b9169de0ff8a8d1310443b759c6a6cca0373 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 27 Nov 2024 14:01:22 +0800
Subject: [PATCH 6/8] Add amdgpu-sched-strategy for selecting sched strategy
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +++++++---
llvm/test/CodeGen/AMDGPU/group-image-instructions.ll | 4 ++--
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0c46c66dd38983..416f7e6f46b264 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1309,12 +1309,16 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
- if (EnableMaxIlpSchedStrategy)
+ Attribute SchedStrategyAttr =
+ C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
+ StringRef SchedStrategy =
+ SchedStrategyAttr.isValid() ? SchedStrategyAttr.getValueAsString() : "";
+
+ if (EnableMaxIlpSchedStrategy || SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
if (EnableMaxMemoryClauseSchedStrategy ||
- C->MF->getFunction().hasFnAttribute(
- "amdgpu-enable-max-memory-clause-scheduling-strategy"))
+ SchedStrategy == "max-memory-clause")
return createGCNMaxMemoryClauseMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 5ec4a6ed848156..f68b8161707e40 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-max-memory-clause-scheduling-strategy -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
@@ -479,7 +479,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
-attributes #2 = { alwaysinline nounwind memory(readwrite) "InitialPSInputAddr"="2" "amdgpu-color-export"="1" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,-cumode" }
+attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "InitialPSInputAddr"="2" "amdgpu-color-export"="1" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,-cumode" }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
>From 2df857cd30f2f58387bbd8c5a4343574fde3142c Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 4 Dec 2024 15:22:26 +0800
Subject: [PATCH 7/8] Introduce a new commandline option: amdgpu-sched-strategy
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 24 +++++++------------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 1 +
.../AMDGPU/schedule-ilp-liveness-tracking.mir | 2 +-
llvm/test/CodeGen/AMDGPU/schedule-ilp.ll | 2 +-
4 files changed, 12 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 416f7e6f46b264..9d72497716d797 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -429,16 +429,10 @@ static cl::opt<bool>
cl::desc("Enable loop data prefetch on AMDGPU"),
cl::Hidden, cl::init(false));
-static cl::opt<bool> EnableMaxIlpSchedStrategy(
- "amdgpu-enable-max-ilp-scheduling-strategy",
- cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> EnableMaxMemoryClauseSchedStrategy(
- "amdgpu-enable-max-memory-clause-scheduling-strategy",
- cl::desc("Enable scheduling strategy to maximize memory clause for a "
- "single wave."),
- cl::Hidden, cl::init(false));
+static cl::opt<std::string>
+ AMDGPUSchedStrategy("amdgpu-sched-strategy",
+ cl::desc("Select custom AMDGPU scheduling strategy."),
+ cl::Hidden, cl::init(""));
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
@@ -1311,14 +1305,14 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
Attribute SchedStrategyAttr =
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
- StringRef SchedStrategy =
- SchedStrategyAttr.isValid() ? SchedStrategyAttr.getValueAsString() : "";
+ StringRef SchedStrategy = SchedStrategyAttr.isValid()
+ ? SchedStrategyAttr.getValueAsString()
+ : AMDGPUSchedStrategy;
- if (EnableMaxIlpSchedStrategy || SchedStrategy == "max-ilp")
+ if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
- if (EnableMaxMemoryClauseSchedStrategy ||
- SchedStrategy == "max-memory-clause")
+ if (SchedStrategy == "max-memory-clause")
return createGCNMaxMemoryClauseMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7d7bac50009eed..ad0e4bf48dd77e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -620,6 +620,7 @@ GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
}
+
/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
/// much as possible. This is achieved by:
// 1. Prioritize clustered operations before stall latency heuristic.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir
index 4b6e204ecf9570..c2cd4653bc9bf3 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp-liveness-tracking.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-sched-strategy=max-ilp -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
---
name: max-ilp-liveness-tracking
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
index 11602b1d353f91..350ff94373a725 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp -verify-machineinstrs < %s | FileCheck %s
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
>From 87479b03d69d5e877775a097d2516042acbd0b11 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Thu, 5 Dec 2024 16:32:38 +0800
Subject: [PATCH 8/8] Refine code and test
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 24 +-
.../AMDGPU/group-image-instructions.ll | 224 ++++++++----------
2 files changed, 109 insertions(+), 139 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ad0e4bf48dd77e..4637631cd01f61 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -645,17 +645,17 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
- // Avoid exceeding the target's limit.
- if (DAG->isTrackingPressure() &&
- tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
- RegExcess, TRI, DAG->MF))
- return TryCand.Reason != NoCand;
+ if (DAG->isTrackingPressure()) {
+ // Avoid exceeding the target's limit.
+ if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
- // Avoid increasing the max critical pressure in the scheduled region.
- if (DAG->isTrackingPressure() &&
- tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
- TryCand, Cand, RegCritical, TRI, DAG->MF))
- return TryCand.Reason != NoCand;
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+ }
// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
@@ -737,8 +737,8 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
- if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
- (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ if ((Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum))) {
+ assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index f68b8161707e40..32356cd8b04e18 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
; GFX11: ; %bb.0: ; %.entry
-; GFX11-NEXT: s_mov_b64 s[16:17], exec
-; GFX11-NEXT: s_wqm_b64 exec, exec
+; GFX11-NEXT: s_mov_b32 s16, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
; GFX11-NEXT: s_mov_b32 s0, s1
@@ -16,11 +16,11 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0
-; GFX11-NEXT: s_mov_b64 s[18:19], exec
-; GFX11-NEXT: s_wqm_b64 exec, exec
+; GFX11-NEXT: s_mov_b32 s17, exec_lo
+; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
-; GFX11-NEXT: s_mov_b64 exec, s[18:19]
+; GFX11-NEXT: s_mov_b32 exec_lo, s17
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x10
@@ -30,21 +30,17 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_interp_p2_f32 v45, v2, v1, v4 wait_exp:7
-; GFX11-NEXT: v_interp_p2_f32 v44, v3, v1, v0 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v61, v2, v1, v4 wait_exp:7
+; GFX11-NEXT: v_interp_p2_f32 v60, v3, v1, v0 wait_exp:7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
-; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
-; GFX11-NEXT: v_add_f32_e32 v8, s20, v44
-; GFX11-NEXT: v_add_f32_e32 v9, s21, v45
-; GFX11-NEXT: v_add_f32_e32 v16, s24, v44
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
+; GFX11-NEXT: v_dual_add_f32 v8, s20, v60 :: v_dual_add_f32 v9, s21, v61
+; GFX11-NEXT: v_dual_add_f32 v16, s24, v60 :: v_dual_add_f32 v17, s25, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
-; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
-; GFX11-NEXT: v_add_f32_e32 v17, s25, v45
+; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@@ -54,135 +50,109 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x70
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x80
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
-; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
-; GFX11-NEXT: v_add_f32_e32 v24, s20, v44
-; GFX11-NEXT: v_add_f32_e32 v25, s21, v45
+; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
+; GFX11-NEXT: v_dual_add_f32 v24, s20, v60 :: v_dual_add_f32 v25, s21, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[20:23], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
+; GFX11-NEXT: v_dual_add_f32 v32, s24, v60 :: v_dual_add_f32 v33, s25, v61
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample v[28:31], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x90
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xa0
-; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0xb0
-; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0xc0
-; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0xd0
-; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0xe0
-; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xf0
-; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
-; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
-; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
-; GFX11-NEXT: v_add_f32_e32 v28, s24, v44
-; GFX11-NEXT: v_add_f32_e32 v29, s25, v45
+; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xb0
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0xc0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v30, s18, v44
-; GFX11-NEXT: v_add_f32_e32 v31, s19, v45
-; GFX11-NEXT: v_add_f32_e32 v32, s20, v44
-; GFX11-NEXT: v_add_f32_e32 v33, s21, v45
-; GFX11-NEXT: v_add_f32_e32 v34, s26, v44
-; GFX11-NEXT: v_add_f32_e32 v35, s27, v45
-; GFX11-NEXT: v_add_f32_e32 v36, s28, v44
-; GFX11-NEXT: v_add_f32_e32 v37, s29, v45
-; GFX11-NEXT: v_add_f32_e32 v38, s30, v44
-; GFX11-NEXT: v_add_f32_e32 v39, s31, v45
-; GFX11-NEXT: v_add_f32_e32 v40, s34, v44
-; GFX11-NEXT: v_add_f32_e32 v41, s35, v45
-; GFX11-NEXT: v_add_f32_e32 v42, s36, v44
-; GFX11-NEXT: v_add_f32_e32 v43, s37, v45
-; GFX11-NEXT: v_add_f32_e32 v44, s12, v44
-; GFX11-NEXT: v_add_f32_e32 v45, s13, v45
-; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: v_add_f32_e32 v46, v8, v4
-; GFX11-NEXT: v_add_f32_e32 v47, v9, v5
-; GFX11-NEXT: v_add_f32_e32 v48, v10, v6
-; GFX11-NEXT: v_add_f32_e32 v49, v11, v7
-; GFX11-NEXT: s_and_b64 exec, exec, s[16:17]
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[8:11], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: v_add_f32_e32 v0, v12, v46
-; GFX11-NEXT: v_add_f32_e32 v1, v13, v47
-; GFX11-NEXT: v_add_f32_e32 v46, v14, v48
-; GFX11-NEXT: v_add_f32_e32 v47, v15, v49
+; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
+; GFX11-NEXT: v_dual_add_f32 v40, s20, v60 :: v_dual_add_f32 v41, s21, v61
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[12:15], v[30:31], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[28:31], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
-; GFX11-NEXT: v_add_f32_e32 v46, v18, v46
-; GFX11-NEXT: v_add_f32_e32 v47, v19, v47
+; GFX11-NEXT: image_sample v[36:39], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
+; GFX11-NEXT: v_dual_add_f32 v48, s24, v60 :: v_dual_add_f32 v49, s25, v61
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[16:19], v[34:35], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[32:35], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
-; GFX11-NEXT: v_add_f32_e32 v46, v22, v46
-; GFX11-NEXT: v_add_f32_e32 v47, v23, v47
+; GFX11-NEXT: image_sample v[44:47], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xd0
+; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xe0
+; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xf0
+; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
+; GFX11-NEXT: v_dual_add_f32 v56, s20, v60 :: v_dual_add_f32 v57, s21, v61
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[20:23], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[36:39], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
-; GFX11-NEXT: v_add_f32_e32 v46, v26, v46
-; GFX11-NEXT: v_add_f32_e32 v47, v27, v47
+; GFX11-NEXT: image_sample v[52:55], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
+; GFX11-NEXT: v_dual_add_f32 v64, s12, v60 :: v_dual_add_f32 v65, s13, v61
+; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s16
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: image_sample v[24:27], v[42:43], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: image_sample v[40:43], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[60:63], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample v[64:67], v[64:65], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: v_dual_add_f32 v0, v8, v4 :: v_dual_add_f32 v1, v9, v5
+; GFX11-NEXT: v_dual_add_f32 v4, v10, v6 :: v_dual_add_f32 v5, v11, v7
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v12, v0 :: v_dual_add_f32 v1, v13, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v14, v4 :: v_dual_add_f32 v5, v15, v5
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v16, v0 :: v_dual_add_f32 v1, v17, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v18, v4 :: v_dual_add_f32 v5, v19, v5
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v20, v0 :: v_dual_add_f32 v1, v21, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v22, v4 :: v_dual_add_f32 v5, v23, v5
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v24, v0 :: v_dual_add_f32 v1, v25, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v26, v4 :: v_dual_add_f32 v5, v27, v5
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: v_add_f32_e32 v0, v4, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v6, v46
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v47
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v28, v0 :: v_dual_add_f32 v1, v29, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v30, v4 :: v_dual_add_f32 v5, v31, v5
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: v_add_f32_e32 v0, v8, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v9, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v11, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v32, v0 :: v_dual_add_f32 v1, v33, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v34, v4 :: v_dual_add_f32 v5, v35, v5
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: v_add_f32_e32 v0, v12, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v13, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v14, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v15, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v36, v0 :: v_dual_add_f32 v1, v37, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v38, v4 :: v_dual_add_f32 v5, v39, v5
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: v_add_f32_e32 v0, v28, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v29, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v30, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v31, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v40, v0 :: v_dual_add_f32 v1, v41, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v42, v4 :: v_dual_add_f32 v5, v43, v5
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v18, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v19, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v44, v0 :: v_dual_add_f32 v1, v45, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v46, v4 :: v_dual_add_f32 v5, v47, v5
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: v_add_f32_e32 v0, v32, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v33, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v34, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v35, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v48, v0 :: v_dual_add_f32 v1, v49, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v50, v4 :: v_dual_add_f32 v5, v51, v5
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v22, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v23, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v52, v0 :: v_dual_add_f32 v1, v53, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v54, v4 :: v_dual_add_f32 v5, v55, v5
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: v_add_f32_e32 v0, v36, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v37, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v38, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v39, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v56, v0 :: v_dual_add_f32 v1, v57, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v58, v4 :: v_dual_add_f32 v5, v59, v5
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v26, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v27, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v60, v0 :: v_dual_add_f32 v1, v61, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v62, v4 :: v_dual_add_f32 v5, v63, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v0, v40, v0
-; GFX11-NEXT: v_add_f32_e32 v1, v41, v1
-; GFX11-NEXT: v_add_f32_e32 v4, v42, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v43, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v64, v0 :: v_dual_add_f32 v1, v65, v1
+; GFX11-NEXT: v_dual_add_f32 v4, v66, v4 :: v_dual_add_f32 v5, v67, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5
; GFX11-NEXT: exp mrt0 v0, v1, off, off done
@@ -479,7 +449,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
-attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "InitialPSInputAddr"="2" "amdgpu-color-export"="1" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,-cumode" }
+attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
More information about the llvm-commits mailing list