[llvm] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol sequence (PR #71061)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 06:47:09 PDT 2023


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/71061

Add an s_delay_alu in this sequence to account for the single cycle
delay before s0 can be used:

  s_getpc_b64 s[0:1]
  s_delay_alu instid0(SALU_CYCLE_1)
  s_add_u32 s0, s0, symbol@lo+8
  s_addc_u32 s1, s1, symbol@hi+16

AMDGPUInsertDelayAlu will not insert this s_delay_alu instruction
automatically because it cannot modify the contents of bundles.


>From 4a768823a0a119c38941c79de88fee2f4b9026dc Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 2 Nov 2023 13:43:20 +0000
Subject: [PATCH] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol
 sequence

Add an s_delay_alu in this sequence to account for the single cycle
delay before s0 can be used:

  s_getpc_b64 s[0:1]
  s_delay_alu instid0(SALU_CYCLE_1)
  s_add_u32 s0, s0, symbol@lo+8
  s_addc_u32 s1, s1, symbol@hi+16

AMDGPUInsertDelayAlu will not insert this s_delay_alu instruction
automatically because it cannot modify the contents of bundles.
---
 .../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp    |   2 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  13 +-
 .../GlobalISel/dynamic-alloca-uniform.ll      |  15 +-
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll |  20 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     | 275 +++++++++++-------
 .../CodeGen/AMDGPU/calling-conventions.ll     |  10 +-
 llvm/test/CodeGen/AMDGPU/cc-update.ll         |  20 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 100 ++++---
 .../AMDGPU/global_atomics_scan_fmax.ll        |  60 ++--
 .../AMDGPU/global_atomics_scan_fmin.ll        |  60 ++--
 .../AMDGPU/global_atomics_scan_fsub.ll        | 100 ++++---
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  15 +-
 .../AMDGPU/llvm.amdgcn.s.buffer.load.ll       |   5 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |  40 ++-
 .../propagate-attributes-bitcast-function.ll  |  12 +-
 llvm/test/CodeGen/AMDGPU/rel32.ll             |  13 +-
 ...tack-pointer-offset-relative-frameindex.ll |  10 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |  10 +-
 18 files changed, 473 insertions(+), 307 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 7619a39bac9c142..b417d8502603f30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -342,6 +342,8 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
 
     // Iterate over the contents of bundles, but don't emit any instructions
     // inside a bundle.
+    // TODO: If the bundle already contains an s_delay_alu instruction, remember
+    // it in LastDelayAlu so we can try encoding another delay in it?
     for (auto &MI : MBB.instrs()) {
       if (MI.isBundle() || MI.isMetaInstruction())
         continue;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f61735a59c97078..2743198766d578d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2405,13 +2405,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
     // instruction.
 
+    int64_t Adjust = 0;
+    if (ST.hasDelayAlu()) {
+      // Manually add the 1 cycle delay before using RegLo. AMDGPUInsertDelayAlu
+      // will not add this automatically inside a bundle:
+      //   s_delay_alu instid0(SALU_CYCLE_1)
+      Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_DELAY_ALU)).addImm(9));
+      Adjust += 4;
+    }
+
     if (OpLo.isGlobal())
-      OpLo.setOffset(OpLo.getOffset() + 4);
+      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
     Bundler.append(
         BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
 
     if (OpHi.isGlobal())
-      OpHi.setOffset(OpHi.getOffset() + 12);
+      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                        .addReg(RegHi)
                        .add(OpHi));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 96db1f889690df6..c31f4e48c672b30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -117,8 +117,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
@@ -252,8 +253,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
@@ -395,8 +397,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_add_i32 s32, s32, 64
 ; GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..b64da0ff7c4b7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -36,8 +36,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+16
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
 ; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
@@ -72,8 +73,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+16
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
 ; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
@@ -141,8 +143,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v28, s24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v29, s25
 ; GISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; GISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+16
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
 ; GISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
@@ -272,8 +275,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v20, s24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v19, s25
 ; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+16
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
 ; DAGISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..f2aa24c1a7f7175 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -124,8 +124,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -228,8 +229,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+16
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -338,8 +340,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+16
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -432,8 +435,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -534,8 +538,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -639,8 +644,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -732,8 +738,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -833,8 +840,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -938,8 +946,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1031,8 +1040,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1123,8 +1133,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1229,8 +1240,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1333,8 +1345,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1448,8 +1461,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1576,8 +1590,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1675,8 +1690,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1764,8 +1780,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1856,8 +1873,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1953,8 +1971,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2058,8 +2077,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2154,8 +2174,9 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2254,8 +2275,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2363,8 +2385,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2467,8 +2490,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2571,8 +2595,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2676,8 +2701,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2772,8 +2798,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2867,8 +2894,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -2970,8 +2998,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3068,8 +3097,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3170,8 +3200,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3270,8 +3301,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3365,8 +3397,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3462,8 +3495,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3563,8 +3597,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3663,8 +3698,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3765,8 +3801,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3871,8 +3908,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -3980,8 +4018,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -4104,8 +4143,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -4222,8 +4262,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
@@ -4366,8 +4407,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x7
 ; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
@@ -4531,8 +4573,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
@@ -4673,8 +4716,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ; GFX11-NEXT:    s_mov_b32 s39, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s38, -1
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[36:39], 0 dlc
@@ -4788,8 +4832,9 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
@@ -4916,8 +4961,9 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX11-NEXT:    s_mov_b32 s32, 16
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
@@ -5082,8 +5128,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX11-NEXT:    s_mov_b32 s32, 32
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
@@ -5292,8 +5339,9 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
@@ -5573,8 +5621,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX11-NEXT:    v_mov_b32_e32 v30, s18
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -5696,8 +5745,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v31, s32
 ; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:24
@@ -5786,8 +5836,9 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32
 ; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:4
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -6055,8 +6106,9 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
@@ -6398,8 +6450,9 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 0x41200000
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
@@ -6766,8 +6819,9 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
@@ -7141,8 +7195,9 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index cb89841b58f9787..dd1b0878d48c1d2 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -159,8 +159,9 @@ define amdgpu_kernel void @call_coldcc() #0 {
 ; GFX11-LABEL: call_coldcc:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, coldcc at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, coldcc at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, coldcc at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, coldcc at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
@@ -223,8 +224,9 @@ define amdgpu_kernel void @call_fastcc() #0 {
 ; GFX11-LABEL: call_fastcc:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, fastcc at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, fastcc at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, fastcc at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, fastcc at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 8a69069d75e2c68..9215f9b23ecb46b 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -138,8 +138,9 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
 ; GFX1100-NEXT:    s_mov_b32 s32, 0
 ; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+16
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1100-NEXT:    s_endpgm
 
@@ -234,8 +235,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT:    scratch_store_b32 off, v1, off offset:4 dlc
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+16
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1100-NEXT:    s_endpgm
 
@@ -393,8 +395,9 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX1100-NEXT:    s_mov_b32 s32, 0
 ; GFX1100-NEXT:    s_mov_b32 s33, 0
 ; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+16
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1100-NEXT:    s_endpgm
 ; GFX1010-NEXT    s_add_u32 s12, s12, s17
@@ -511,8 +514,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 offset:4 dlc
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+8
+; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+16
 ; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1100-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 4cbd5e84871cc75..927d12260a586f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -598,8 +598,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -651,8 +652,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -947,8 +949,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -1018,8 +1021,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1804,8 +1808,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -1868,8 +1873,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -2175,8 +2181,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -2257,8 +2264,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3054,8 +3062,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -3107,8 +3116,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3403,8 +3413,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -3474,8 +3485,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3802,8 +3814,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -3855,8 +3868,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -4151,8 +4165,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -4222,8 +4237,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -5007,8 +5023,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -5071,8 +5088,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -5378,8 +5396,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -5460,8 +5479,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e07514b063ee481..1377e24a76de653 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -629,8 +629,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -698,8 +699,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1036,8 +1038,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -1131,8 +1134,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1828,8 +1832,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -1897,8 +1902,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -2235,8 +2241,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -2330,8 +2337,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3027,8 +3035,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -3096,8 +3105,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3434,8 +3444,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -3529,8 +3540,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ad6edbd2c37a09d..1a1a34109340ec5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -629,8 +629,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -698,8 +699,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1036,8 +1038,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -1131,8 +1134,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1828,8 +1832,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -1897,8 +1902,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -2235,8 +2241,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -2330,8 +2337,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3027,8 +3035,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -3096,8 +3105,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3434,8 +3444,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -3529,8 +3540,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 4a00d7bc71bca8a..e7720e88317c071 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -650,8 +650,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -714,8 +715,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1021,8 +1023,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -1103,8 +1106,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -1900,8 +1904,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -1964,8 +1969,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -2271,8 +2277,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -2353,8 +2360,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3150,8 +3158,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -3214,8 +3223,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3521,8 +3531,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -3603,8 +3614,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -3942,8 +3954,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -4006,8 +4019,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -4313,8 +4327,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -4395,8 +4410,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -5191,8 +5207,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-NEXT:    s_mov_b32 s14, s10
@@ -5255,8 +5272,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -5562,8 +5580,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
@@ -5644,8 +5663,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
 ; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+8
+; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+16
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
 ; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 220ea962b9e1dca..50634733bebad9e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -14,8 +14,9 @@ define void @f0() {
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f1 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f1 at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, f1 at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, f1 at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_writelane_b32 v4, s30, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v4, s31, 1
@@ -81,8 +82,9 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
 ; GFX11-NEXT:    s_addc_u32 s9, s17, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0 at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, f0 at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, f0 at gotpcrel32@hi+16
 ; GFX11-NEXT:    s_mov_b32 s13, s14
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, s14
@@ -179,8 +181,9 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
 ; GFX11-NEXT:    s_addc_u32 s9, s17, 0
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0 at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, f0 at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, f0 at gotpcrel32@hi+16
 ; GFX11-NEXT:    s_mov_b32 s13, s14
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s14, s15
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 970c2c1c0456e0e..cbb31120d637c5c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -580,8 +580,9 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX11-LABEL: s_buffer_load_index_across_bb:
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GFX11-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+16
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..c85d8efe14f80a0 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -234,8 +234,9 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1)  %buffer) {
 ; GFX11-LABEL: clmem_read_simplified:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -812,8 +813,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-LABEL: clmem_read:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -1242,8 +1244,9 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
 ; GFX11-LABEL: Address32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -1502,8 +1505,9 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-LABEL: Offset64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -1717,8 +1721,9 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-LABEL: p32Offset64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -1968,8 +1973,9 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
 ; GFX11-LABEL: DiffBase:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b128 s[36:39], s[0:1], 0x24
@@ -2285,8 +2291,9 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
 ; GFX11-LABEL: ReverseOrder:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
@@ -2522,8 +2529,9 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX11-LABEL: negativeoffset:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, _Z13get_global_idj at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, _Z13get_global_idj at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX11-NEXT:    s_load_b64 s[34:35], s[0:1], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
index 25a2924bef541a9..29155f6156d1db0 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll
@@ -1,11 +1,15 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 
 ; GCN: foo1:
 ; v_cndmask_b32_e64 v0, 0, 1, vcc_lo{{$}}
 ; GCN: kernel1:
-; GCN: foo1@gotpcrel32@lo+4
-; GCN: foo1@gotpcrel32@hi+12
+; GCN: s_getpc_b64
+; GFX10-NEXT: foo1@gotpcrel32@lo+4
+; GFX10-NEXT: foo1@gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu
+; GFX11-NEXT: foo1@gotpcrel32@lo+8
+; GFX11-NEXT: foo1@gotpcrel32@hi+16
 
 define void @foo1(i32 %x) #1 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index 41bf8f4ea8434ee..ef06c08d848d0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -1,13 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX910
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX910
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,GFX11
 
 @g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4
 
 ; CHECK-LABEL: rel32_neg_offset:
 ; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
-; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; GFX910-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
+; GFX910-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; GFX11-NEXT: s_delay_alu
+; GFX11-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo
+; GFX11-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+8
 define ptr addrspace(4) @rel32_neg_offset() {
   %r = getelementptr i32, ptr addrspace(4) @g, i64 -2
   ret ptr addrspace(4) %r
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index d52655a0791615d..61ced07bb2e899b 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -87,8 +87,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
 ; MUBUF11-NEXT:    v_mov_b32_e32 v4, 0x400000
 ; MUBUF11-NEXT:    s_movk_i32 s32, 0x6000
 ; MUBUF11-NEXT:    s_getpc_b64 s[0:1]
-; MUBUF11-NEXT:    s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
-; MUBUF11-NEXT:    s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
+; MUBUF11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; MUBUF11-NEXT:    s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
+; MUBUF11-NEXT:    s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
 ; MUBUF11-NEXT:    s_waitcnt lgkmcnt(0)
 ; MUBUF11-NEXT:    v_mov_b32_e32 v0, s2
 ; MUBUF11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -114,8 +115,9 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr
 ; FLATSCR11-NEXT:    v_mov_b32_e32 v4, 0x400000
 ; FLATSCR11-NEXT:    s_movk_i32 s32, 0x6000
 ; FLATSCR11-NEXT:    s_getpc_b64 s[0:1]
-; FLATSCR11-NEXT:    s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4
-; FLATSCR11-NEXT:    s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12
+; FLATSCR11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; FLATSCR11-NEXT:    s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+8
+; FLATSCR11-NEXT:    s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+16
 ; FLATSCR11-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR11-NEXT:    v_mov_b32_e32 v0, s2
 ; FLATSCR11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 92efbe5a7182634..726c711273eb589 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -149,8 +149,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+16
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
@@ -316,8 +317,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+16
 ; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0



More information about the llvm-commits mailing list