[clang] [clang-tools-extra] [llvm] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12 (PR #78186)

Jay Foad via cfe-commits <cfe-commits@lists.llvm.org>
Thu Jan 18 02:17:19 PST 2024


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/78186

From d3f4ebf849f6ef1ea373e5c7f93398db6681b2b6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Mon, 15 Jan 2024 15:02:08 +0000
Subject: [PATCH 1/4] Add GFX11/12 test coverage

---
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 103 +++++++++++++-----
 1 file changed, 77 insertions(+), 26 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 598d7a8033c2e54..2c1baeeeda21697 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -1,32 +1,83 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s
-
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
 
 define void @test_remat_s_getpc_b64() {
-; CHECK-LABEL: test_remat_s_getpc_b64:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_mov_b32_e32 v2, s5
-; CHECK-NEXT:    global_store_dwordx2 v[1:2], v[1:2], off
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_remat_s_getpc_b64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    global_store_dwordx2 v[1:2], v[1:2], off
+; GFX9-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_remat_s_getpc_b64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX11-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_remat_s_getpc_b64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX12-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX12-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i64 @llvm.amdgcn.s.getpc()
   tail call void asm sideeffect "", "s"(i64 %0)

From f192ec981932333d535999082727c13d569c87c1 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 2 Nov 2023 11:37:10 +0000
Subject: [PATCH 2/4] [AMDGPU] Work around s_getpc_b64 zero extending on GFX12

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  2 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  4 +++
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 26 +++++++++++++++++--
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  5 +++-
 .../AMDGPU/llvm.amdgcn.s.buffer.load.ll       |  5 ++--
 llvm/test/CodeGen/AMDGPU/remat-sop.mir        | 24 ++++++++---------
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll |  7 +++--
 8 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 93b2e0b9450bef8..4e67099fe1fd745 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1810,6 +1810,8 @@ def int_amdgcn_s_setreg :
 // not cross a 4Gb address boundary. Use for any other purpose may not
 // produce the desired results as optimizations may cause code movement,
 // especially as we explicitly use IntrNoMem to allow optimizations.
+// This intrinsic always returns PC sign-extended from 48 bits even if the
+// s_getpc_b64 instruction returns a zero-extended value.
 def int_amdgcn_s_getpc :
   ClangBuiltin<"__builtin_amdgcn_s_getpc">,
   DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
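
The ClangBuiltin binding above means this intrinsic is reachable from source
code as __builtin_amdgcn_s_getpc. A minimal sketch of what the contract
guarantees to a caller, written as HIP-style C++ device code (the wrapper
function is hypothetical, for illustration only):

#include <cstdint>

// Sketch: whatever the hardware's s_getpc_b64 does, the builtin promises
// a value sign-extended from bit 47.
__device__ uint64_t get_signed_pc() {
  uint64_t pc = __builtin_amdgcn_s_getpc();
  // On GFX12 the backend emits an extra s_sext_i32_i16 on the high half
  // (see PATCH 2/4 below), so this invariant still holds:
  //   (int64_t)(pc << 16) >> 16 == (int64_t)pc
  return pc;
}
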
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 070d165cdaadb8f..ce499636ac1d365 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1259,6 +1259,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
 
+  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
+  // of sign-extending.
+  bool hasGetPCZeroExtension() const { return GFX12Insts; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
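
For orientation, hasGetPCZeroExtension() distinguishes the two possible
48-to-64-bit extensions of the PC. A self-contained host-side C++ sketch
(the PC value is an arbitrary example) of the difference, and of what the
s_sext_i32_i16 fix-up computes on the high dword:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t pc48 = 0xFFFF12345678ULL;          // 48-bit PC with bit 47 set
  uint64_t zext = pc48;                        // GFX12 s_getpc_b64 result
  int64_t sext = (int64_t)(pc48 << 16) >> 16;  // the intrinsic's contract
  // s_sext_i32_i16 dsthi, dsthi sign-extends PC bits [47:32] into [63:32]:
  uint32_t hi = (uint32_t)(int32_t)(int16_t)(zext >> 32);
  uint64_t fixed = ((uint64_t)hi << 32) | (uint32_t)zext;
  assert(fixed == (uint64_t)sext);  // the fix-up restores the contract
  return 0;
}
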
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df144486678..acd0c4ed7f1d5c5 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -188,7 +188,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
         .addImm(MFI->getGITPtrHigh())
         .addReg(TargetReg, RegState::ImplicitDefine);
   } else {
-    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_PSEUDO);
     BuildMI(MBB, I, DL, GetPC64, TargetReg);
   }
   Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa98a4b860dda95..ba04b55a840b201 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2410,13 +2410,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
     // instruction.
 
+    int64_t Adjust = 0;
+    if (ST.hasGetPCZeroExtension()) {
+      // Fix up hardware that does not sign-extend the 48-bit PC value by
+      // inserting: s_sext_i32_i16 reghi, reghi
+      Bundler.append(
+          BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
+      Adjust += 4;
+    }
+
     if (OpLo.isGlobal())
-      OpLo.setOffset(OpLo.getOffset() + 4);
+      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
     Bundler.append(
         BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
 
     if (OpHi.isGlobal())
-      OpHi.setOffset(OpHi.getOffset() + 12);
+      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                        .addReg(RegHi)
                        .add(OpHi));
@@ -2480,6 +2489,19 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::S_MUL_I64_I32_PSEUDO:
     MI.setDesc(get(AMDGPU::S_MUL_U64));
     break;
+
+  case AMDGPU::S_GETPC_B64_PSEUDO:
+    MI.setDesc(get(AMDGPU::S_GETPC_B64));
+    if (ST.hasGetPCZeroExtension()) {
+      Register Dst = MI.getOperand(0).getReg();
+      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+      // Fix up hardware that does not sign-extend the 48-bit PC value by
+      // inserting: s_sext_i32_i16 dsthi, dsthi
+      BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
+              DstHi)
+          .addReg(DstHi);
+    }
+    break;
   }
   return true;
 }
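
To see why Adjust is 4: s_getpc_b64 returns the address of the instruction
that follows it, each SOP encoding word is 4 bytes, and the relocated symbol
sits in a 32-bit literal directly after the opcode word. A small sketch of
the resulting relocation offsets (sizes hard-coded from the GCN encodings):

#include <cstdio>

int main() {
  const int OpcodeBytes = 4;             // one SOP encoding word
  const int AddBytes = OpcodeBytes + 4;  // s_add_u32 plus 32-bit literal
  for (int Adjust : {0, 4}) {            // 4 = size of s_sext_i32_i16
    int LoOff = Adjust + OpcodeBytes;             // literal inside s_add_u32
    int HiOff = Adjust + AddBytes + OpcodeBytes;  // literal inside s_addc_u32
    printf("Adjust=%d -> @lo+%d, @hi+%d\n", Adjust, LoOff, HiOff);
  }
  // Prints lo+4/hi+12 without the fix-up and lo+8/hi+16 with it, matching
  // the llvm.amdgcn.s.buffer.load.ll update later in this patch.
  return 0;
}
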
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 46fa3d57a21cb2c..2fd16f82edbbcd5 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -292,8 +292,11 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
 def S_BITSET1_B32 : SOP1_32    <"s_bitset1_b32", [], 1>;
 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
 
+def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64">;
+// PSEUDO includes a workaround for a hardware anomaly where some ASICs
+// zero-extend the result from 48 bits instead of sign-extending.
 let isReMaterializable = 1 in
-def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64",
+def S_GETPC_B64_PSEUDO : SOP1_64_0  <"s_getpc_b64",
   [(set i64:$sdst, (int_amdgcn_s_getpc))]
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 7dffd1a75ed0ec1..b0f689680c311c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -723,8 +723,9 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX12-LABEL: s_buffer_load_index_across_bb:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_getpc_b64 s[4:5]
-; GFX12-NEXT:    s_add_co_u32 s4, s4, gv@gotpcrel32@lo+4
-; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+12
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, gv@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+16
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index e41c42c4f40b824..0935c61af0337cf 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -581,16 +581,16 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_PSEUDO
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sgpr_64 = S_GETPC_B64
-    %1:sgpr_64 = S_GETPC_B64
-    %2:sgpr_64 = S_GETPC_B64
+    %0:sgpr_64 = S_GETPC_B64_PSEUDO
+    %1:sgpr_64 = S_GETPC_B64_PSEUDO
+    %2:sgpr_64 = S_GETPC_B64_PSEUDO
     S_NOP 0, implicit %0
     S_NOP 0, implicit %1
     S_NOP 0, implicit %2
@@ -604,15 +604,15 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64_2
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_PSEUDO
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.3, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.3, addrspace 5)
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY renamable $sgpr2
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr3
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.5, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.5, addrspace 5)
     ; GCN-NEXT: renamable $sgpr0 = COPY killed renamable $sgpr1
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.4, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.4, addrspace 5)
@@ -635,9 +635,9 @@ body:             |
     ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5)
     ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sreg_64 = S_GETPC_B64
-    %1:sreg_64 = S_GETPC_B64
-    %2:sreg_64 = S_GETPC_B64
+    %0:sreg_64 = S_GETPC_B64_PSEUDO
+    %1:sreg_64 = S_GETPC_B64_PSEUDO
+    %2:sreg_64 = S_GETPC_B64_PSEUDO
     %4:sreg_32 = COPY %0.sub0:sreg_64
     %5:sreg_32 = COPY %0.sub1:sreg_64
     %6:sreg_32 = COPY %1.sub0:sreg_64
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 2c1baeeeda21697..84953b70c3bb94e 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -62,15 +62,18 @@ define void @test_remat_s_getpc_b64() {
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX12-NEXT:    v_writelane_b32 v0, s30, 0
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    v_writelane_b32 v0, s31, 1
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    ;;#ASMEND
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX12-NEXT:    v_readlane_b32 s30, v0, 0
 ; GFX12-NEXT:    global_store_b64 v[1:2], v[1:2], off
 ; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1

From 7e14cf26f20172d9ac0578725626de94d9f06a33 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 17 Jan 2024 14:46:33 +0000
Subject: [PATCH 3/4] Lowercase _pseudo

---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp     |  2 +-
 llvm/lib/Target/AMDGPU/SOPInstructions.td  |  2 +-
 llvm/test/CodeGen/AMDGPU/remat-sop.mir     | 24 +++++++++++-----------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index acd0c4ed7f1d5c5..a02c2a465908222 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -188,7 +188,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
         .addImm(MFI->getGITPtrHigh())
         .addReg(TargetReg, RegState::ImplicitDefine);
   } else {
-    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_PSEUDO);
+    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
     BuildMI(MBB, I, DL, GetPC64, TargetReg);
   }
   Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba04b55a840b201..b60c24d14bea436 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2490,7 +2490,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_MUL_U64));
     break;
 
-  case AMDGPU::S_GETPC_B64_PSEUDO:
+  case AMDGPU::S_GETPC_B64_pseudo:
     MI.setDesc(get(AMDGPU::S_GETPC_B64));
     if (ST.hasGetPCZeroExtension()) {
       Register Dst = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2fd16f82edbbcd5..6e08630ee07671d 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -296,7 +296,7 @@ def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64">;
 // PSEUDO includes a workaround for a hardware anomaly where some ASICs
 // zero-extend the result from 48 bits instead of sign-extending.
 let isReMaterializable = 1 in
-def S_GETPC_B64_PSEUDO : SOP1_64_0  <"s_getpc_b64",
+def S_GETPC_B64_pseudo : SOP1_64_0  <"s_getpc_b64",
   [(set i64:$sdst, (int_amdgcn_s_getpc))]
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index 0935c61af0337cf..81aa3a39de42ffc 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -581,16 +581,16 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_PSEUDO
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sgpr_64 = S_GETPC_B64_PSEUDO
-    %1:sgpr_64 = S_GETPC_B64_PSEUDO
-    %2:sgpr_64 = S_GETPC_B64_PSEUDO
+    %0:sgpr_64 = S_GETPC_B64_pseudo
+    %1:sgpr_64 = S_GETPC_B64_pseudo
+    %2:sgpr_64 = S_GETPC_B64_pseudo
     S_NOP 0, implicit %0
     S_NOP 0, implicit %1
     S_NOP 0, implicit %2
@@ -604,15 +604,15 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64_2
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_PSEUDO
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.3, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.3, addrspace 5)
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY renamable $sgpr2
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr3
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_PSEUDO
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.5, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.5, addrspace 5)
     ; GCN-NEXT: renamable $sgpr0 = COPY killed renamable $sgpr1
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.4, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.4, addrspace 5)
@@ -635,9 +635,9 @@ body:             |
     ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5)
     ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sreg_64 = S_GETPC_B64_PSEUDO
-    %1:sreg_64 = S_GETPC_B64_PSEUDO
-    %2:sreg_64 = S_GETPC_B64_PSEUDO
+    %0:sreg_64 = S_GETPC_B64_pseudo
+    %1:sreg_64 = S_GETPC_B64_pseudo
+    %2:sreg_64 = S_GETPC_B64_pseudo
     %4:sreg_32 = COPY %0.sub0:sreg_64
     %5:sreg_32 = COPY %0.sub1:sreg_64
     %6:sreg_32 = COPY %1.sub0:sreg_64

From 8f46084ed3a306d239e649f6c149b0ffb46f0fe6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 17 Jan 2024 15:41:43 +0000
Subject: [PATCH 4/4] Documentation

---
 llvm/docs/AMDGPUUsage.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e05f7fc3e766275..13dc973ec104bf7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1182,6 +1182,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
                                                    The iglp_opt strategy implementations are subject to change.
 
+  llvm.amdgcn.s.getpc                              Provides access to the s_getpc_b64 instruction, but with the return value
+                                                   sign-extended from the width of the underlying PC hardware register even on
+                                                   processors where the s_getpc_b64 instruction returns a zero-extended value.
+
   ==============================================   ==========================================================
 
 .. TODO::


