[llvm] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol sequence (PR #71061)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 3 03:40:18 PDT 2023
https://github.com/nikic updated https://github.com/llvm/llvm-project/pull/71061
>From 4a768823a0a119c38941c79de88fee2f4b9026dc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 2 Nov 2023 13:43:20 +0000
Subject: [PATCH 1/3] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol
sequence
Add an s_delay_alu in this sequence to account for the single cycle
delay before s0 can be used:
s_getpc_b64 s[0:1]
s_delay_alu instid0(SALU_CYCLE_1)
s_add_u32 s0, s0, symbol@lo+8
s_addc_u32 s1, s1, symbol@hi+16
AMDGPUInsertDelayAlu will not insert this s_delay_alu instruction
automatically because it cannot modify the contents of bundles.
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +-
.../GlobalISel/dynamic-alloca-uniform.ll | 15 +-
.../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 20 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 275 +++++++++++-------
.../CodeGen/AMDGPU/calling-conventions.ll | 10 +-
llvm/test/CodeGen/AMDGPU/cc-update.ll | 20 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 100 ++++---
.../AMDGPU/global_atomics_scan_fmax.ll | 60 ++--
.../AMDGPU/global_atomics_scan_fmin.ll | 60 ++--
.../AMDGPU/global_atomics_scan_fsub.ll | 100 ++++---
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 15 +-
.../AMDGPU/llvm.amdgcn.s.buffer.load.ll | 5 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 40 ++-
.../propagate-attributes-bitcast-function.ll | 12 +-
llvm/test/CodeGen/AMDGPU/rel32.ll | 13 +-
...tack-pointer-offset-relative-frameindex.ll | 10 +-
.../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 10 +-
18 files changed, 473 insertions(+), 307 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 7619a39bac9c142..b417d8502603f30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -342,6 +342,8 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
+ // TODO: If the bundle already contains an s_delay_alu instruction, remember
+ // it in LastDelayAlu so we can try encoding another delay in it?
for (auto &MI : MBB.instrs()) {
if (MI.isBundle() || MI.isMetaInstruction())
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f61735a59c97078..2743198766d578d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2405,13 +2405,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// the encoding of $symbol starts 12 bytes after the start of the s_add_u32
// instruction.
+ int64_t Adjust = 0;
+ if (ST.hasDelayAlu()) {
+ // Manually add the 1 cycle delay before using RegLo. AMDGPUInsertDelayAlu
+ // will not add this automatically inside a bundle:
+ // s_delay_alu instid0(SALU_CYCLE_1)
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_DELAY_ALU)).addImm(9));
+ Adjust += 4;
+ }
+
if (OpLo.isGlobal())
- OpLo.setOffset(OpLo.getOffset() + 4);
+ OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
Bundler.append(
BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
if (OpHi.isGlobal())
- OpHi.setOffset(OpHi.getOffset() + 12);
+ OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi)
.add(OpHi));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 96db1f889690df6..c31f4e48c672b30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -117,8 +117,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
@@ -252,8 +253,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
@@ -395,8 +397,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_add_i32 s32, s32, 64
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..b64da0ff7c4b7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -36,8 +36,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+16
; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
@@ -72,8 +73,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+16
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
@@ -141,8 +143,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25
; GISEL-GFX11-NEXT: s_getpc_b64 s[24:25]
-; GISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+16
; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
; GISEL-GFX11-NEXT: s_load_b64 s[24:25], s[24:25], 0x0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
@@ -272,8 +275,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25
; DAGISEL-GFX11-NEXT: s_getpc_b64 s[24:25]
-; DAGISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+16
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
; DAGISEL-GFX11-NEXT: s_load_b64 s[24:25], s[24:25], 0x0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..f2aa24c1a7f7175 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -124,8 +124,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -228,8 +229,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+16
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -338,8 +340,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+16
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -432,8 +435,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -534,8 +538,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -639,8 +644,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -732,8 +738,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -833,8 +840,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -938,8 +946,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1031,8 +1040,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1123,8 +1133,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1229,8 +1240,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1333,8 +1345,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1448,8 +1461,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1576,8 +1590,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1675,8 +1690,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1764,8 +1780,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1856,8 +1873,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1953,8 +1971,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2058,8 +2077,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2154,8 +2174,9 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2254,8 +2275,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2363,8 +2385,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2467,8 +2490,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2571,8 +2595,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2676,8 +2701,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2772,8 +2798,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2867,8 +2894,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -2970,8 +2998,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3068,8 +3097,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3170,8 +3200,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3270,8 +3301,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3365,8 +3397,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3462,8 +3495,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3563,8 +3597,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3663,8 +3698,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3765,8 +3801,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3871,8 +3908,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -3980,8 +4018,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -4104,8 +4143,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -4222,8 +4262,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -4366,8 +4407,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
@@ -4531,8 +4573,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
@@ -4673,8 +4716,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX11-NEXT: s_mov_b32 s39, 0x31016000
; GFX11-NEXT: s_mov_b32 s38, -1
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
@@ -4788,8 +4832,9 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0
@@ -4916,8 +4961,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GFX11-NEXT: s_mov_b32 s32, 16
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+16
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
@@ -5082,8 +5128,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX11-NEXT: s_mov_b32 s32, 32
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+16
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8
; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12
@@ -5292,8 +5339,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
@@ -5573,8 +5621,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; GFX11-NEXT: v_mov_b32_e32 v30, s18
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -5696,8 +5745,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg at rel32@hi+16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b32 off, v31, s32
; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:24
@@ -5786,8 +5836,9 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
; GFX11-NEXT: scratch_load_b32 v33, off, s32
; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:4
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+16
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b32 off, v33, s32
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6055,8 +6106,9 @@ define void @stack_12xv3i32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v30, 10
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
@@ -6398,8 +6450,9 @@ define void @stack_12xv3f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v30, 0x41200000
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
@@ -6766,8 +6819,9 @@ define void @stack_8xv5i32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v30, 6
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
@@ -7141,8 +7195,9 @@ define void @stack_8xv5f32() #0 {
; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index cb89841b58f9787..dd1b0878d48c1d2 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -159,8 +159,9 @@ define amdgpu_kernel void @call_coldcc() #0 {
; GFX11-LABEL: call_coldcc:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, coldcc at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, coldcc at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, coldcc at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, coldcc at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 1.0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s32, 0
@@ -223,8 +224,9 @@ define amdgpu_kernel void @call_fastcc() #0 {
; GFX11-LABEL: call_fastcc:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, fastcc at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, fastcc at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, fastcc at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, fastcc at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 1.0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 8a69069d75e2c68..9215f9b23ecb46b 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -138,8 +138,9 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+16
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -234,8 +235,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+16
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -393,8 +395,9 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+16
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
; GFX1010-NEXT s_add_u32 s12, s12, s17
@@ -511,8 +514,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT: s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex at rel32@hi+16
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 4cbd5e84871cc75..927d12260a586f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -598,8 +598,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -651,8 +652,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -947,8 +949,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -1018,8 +1021,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1804,8 +1808,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -1868,8 +1873,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -2175,8 +2181,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -2257,8 +2264,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3054,8 +3062,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -3107,8 +3116,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3403,8 +3413,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -3474,8 +3485,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3802,8 +3814,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -3855,8 +3868,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -4151,8 +4165,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -4222,8 +4237,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -5007,8 +5023,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -5071,8 +5088,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -5378,8 +5396,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -5460,8 +5479,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e07514b063ee481..1377e24a76de653 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -629,8 +629,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -698,8 +699,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1036,8 +1038,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -1131,8 +1134,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1828,8 +1832,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -1897,8 +1902,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -2235,8 +2241,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -2330,8 +2337,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3027,8 +3035,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -3096,8 +3105,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3434,8 +3444,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -3529,8 +3540,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ad6edbd2c37a09d..1a1a34109340ec5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -629,8 +629,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -698,8 +699,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1036,8 +1038,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -1131,8 +1134,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1828,8 +1832,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -1897,8 +1902,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -2235,8 +2241,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -2330,8 +2337,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3027,8 +3035,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -3096,8 +3105,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3434,8 +3444,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -3529,8 +3540,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 4a00d7bc71bca8a..e7720e88317c071 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -650,8 +650,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -714,8 +715,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1021,8 +1023,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -1103,8 +1106,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -1900,8 +1904,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -1964,8 +1969,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -2271,8 +2277,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -2353,8 +2360,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3150,8 +3158,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -3214,8 +3223,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3521,8 +3531,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -3603,8 +3614,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -3942,8 +3954,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -4006,8 +4019,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -4313,8 +4327,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -4395,8 +4410,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -5191,8 +5207,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b32 s13, s9
; GFX1164-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-NEXT: s_mov_b32 s14, s10
@@ -5255,8 +5272,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1132-NEXT: s_add_u32 s8, s34, 44
; GFX1132-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-NEXT: s_mov_b32 s12, s13
; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -5562,8 +5580,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
@@ -5644,8 +5663,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 220ea962b9e1dca..50634733bebad9e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -14,8 +14,9 @@ define void @f0() {
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, f1 at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, f1 at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, f1 at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, f1 at gotpcrel32@hi+16
; GFX11-NEXT: v_writelane_b32 v4, s30, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v4, s31, 1
@@ -81,8 +82,9 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
; GFX11-NEXT: s_addc_u32 s9, s17, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+16
; GFX11-NEXT: s_mov_b32 s13, s14
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s3, s14
@@ -179,8 +181,9 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
; GFX11-NEXT: s_addc_u32 s9, s17, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+16
; GFX11-NEXT: s_mov_b32 s13, s14
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s14, s15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 970c2c1c0456e0e..cbb31120d637c5c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -580,8 +580,9 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
; GFX11-LABEL: s_buffer_load_index_across_bb:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_getpc_b64 s[4:5]
-; GFX11-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+16
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..c85d8efe14f80a0 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -234,8 +234,9 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-LABEL: clmem_read_simplified:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -812,8 +813,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-LABEL: clmem_read:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -1242,8 +1244,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-LABEL: Address32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -1502,8 +1505,9 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-LABEL: Offset64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -1717,8 +1721,9 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-LABEL: p32Offset64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -1968,8 +1973,9 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-LABEL: DiffBase:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b128 s[36:39], s[0:1], 0x24
@@ -2285,8 +2291,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-LABEL: ReverseOrder:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
@@ -2522,8 +2529,9 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-LABEL: negativeoffset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
index 25a2924bef541a9..29155f6156d1db0 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
@@ -1,11 +1,15 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; GCN: foo1:
; v_cndmask_b32_e64 v0, 0, 1, vcc_lo{{$}}
; GCN: kernel1:
-; GCN: foo1@gotpcrel32@lo+4
-; GCN: foo1@gotpcrel32@hi+12
+; GCN: s_getpc_b64
+; GFX10-NEXT: foo1@gotpcrel32@lo+4
+; GFX10-NEXT: foo1@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu
+; GFX11-NEXT: foo1@gotpcrel32@lo+8
+; GFX11-NEXT: foo1@gotpcrel32@hi+16
define void @foo1(i32 %x) #1 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index 41bf8f4ea8434ee..ef06c08d848d0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -1,13 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX910
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX910
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX11
@g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4
; CHECK-LABEL: rel32_neg_offset:
; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
-; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; GFX910-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
+; GFX910-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; GFX11-NEXT: s_delay_alu
+; GFX11-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo
+; GFX11-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+8
define ptr addrspace(4) @rel32_neg_offset() {
%r = getelementptr i32, ptr addrspace(4) @g, i64 -2
ret ptr addrspace(4) %r
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index d52655a0791615d..61ced07bb2e899b 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -87,8 +87,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000
; MUBUF11-NEXT: s_movk_i32 s32, 0x6000
; MUBUF11-NEXT: s_getpc_b64 s[0:1]
-; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
-; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
+; MUBUF11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
+; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
; MUBUF11-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF11-NEXT: v_mov_b32_e32 v0, s2
; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -114,8 +115,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000
; FLATSCR11-NEXT: s_getpc_b64 s[0:1]
-; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
-; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
+; FLATSCR11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
+; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 92efbe5a7182634..726c711273eb589 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -149,8 +149,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+16
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
@@ -316,8 +317,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+16
; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
>From db1b73003fab9abd7d1bb45f6eb03f690961b497 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 3 Nov 2023 10:20:19 +0000
Subject: [PATCH 2/3] Heed -amdgpu-enable-delay-alu=0
---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++++
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4 ++++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/cc-update.ll | 20 ++++++++-----------
...tack-pointer-offset-relative-frameindex.ll | 10 ++++------
5 files changed, 21 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 375df27206f7b41..9478e6dda9b312c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -292,6 +292,10 @@ static cl::opt<bool>
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
+bool llvm::ShouldInsertDelayAlu() {
+ return EnableInsertDelayAlu;
+}
+
// Enable GFX11+ VOPD
static cl::opt<bool>
EnableVOPD("amdgpu-enable-vopd",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 9051a61e65570cf..66a9a08cd4327e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -22,6 +22,10 @@
namespace llvm {
+/// Return true if GFX11+ s_delay_alu insertion has not been disabled by the
+/// command line option.
+bool ShouldInsertDelayAlu();
+
//===----------------------------------------------------------------------===//
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2743198766d578d..a95681ecef7630a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2406,7 +2406,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// instruction.
int64_t Adjust = 0;
- if (ST.hasDelayAlu()) {
+ if (ST.hasDelayAlu() && ShouldInsertDelayAlu()) {
// Manually add the 1 cycle delay before using RegLo. AMDGPUInsertDelayAlu
// will not add this automatically inside a bundle:
// s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 9215f9b23ecb46b..8a69069d75e2c68 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -138,9 +138,8 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX1100-NEXT: s_mov_b32 s14, s15
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+8
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+16
+; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -235,9 +234,8 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+8
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+16
+; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
@@ -395,9 +393,8 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX1100-NEXT: s_mov_b32 s32, 0
; GFX1100-NEXT: s_mov_b32 s33, 0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+8
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+16
+; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
; GFX1010-NEXT s_add_u32 s12, s12, s17
@@ -514,9 +511,8 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT: s_getpc_b64 s[16:17]
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+8
-; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+16
+; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
+; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index 61ced07bb2e899b..d52655a0791615d 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -87,9 +87,8 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000
; MUBUF11-NEXT: s_movk_i32 s32, 0x6000
; MUBUF11-NEXT: s_getpc_b64 s[0:1]
-; MUBUF11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
-; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
+; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
+; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; MUBUF11-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF11-NEXT: v_mov_b32_e32 v0, s2
; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1]
@@ -115,9 +114,8 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000
; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000
; FLATSCR11-NEXT: s_getpc_b64 s[0:1]
-; FLATSCR11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
-; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
+; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
+; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2
; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1]
>From 07bc714f0e8174dbd6234cb3feee53b6a1aa56df Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 3 Nov 2023 10:40:05 +0000
Subject: [PATCH 3/3] clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9478e6dda9b312c..bfd5cb2cb4d0e5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -292,9 +292,7 @@ static cl::opt<bool>
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
-bool llvm::ShouldInsertDelayAlu() {
- return EnableInsertDelayAlu;
-}
+bool llvm::ShouldInsertDelayAlu() { return EnableInsertDelayAlu; }
// Enable GFX11+ VOPD
static cl::opt<bool>
More information about the llvm-commits mailing list