[llvm] [AMDGPU] Shrink S_MOV_B64 to S_MOV_B32 during rematerialization (PR #184333)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 04:59:14 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Romanov Vlad (romanovvlad)
<details>
<summary>Changes</summary>
When rematerializing S_MOV_B64 or S_MOV_B64_IMM_PSEUDO, if every use of the result found from the remat point onward in the same basic block (up to the next redefinition of the register) reads the same single 32-bit subregister, emit S_MOV_B32 with the corresponding half of the 64-bit immediate instead.
This reduces register pressure by defining one 32-bit register instead of a 64-bit pair when the other half is unused.
---
Full diff: https://github.com/llvm/llvm-project/pull/184333.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+72)
- (added) llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir (+65)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 91d85990ce16c..c8003e4bae9a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2617,6 +2617,78 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
// TODO: Handle more cases.
unsigned Opcode = Orig.getOpcode();
switch (Opcode) {
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+ if (SubIdx != 0)
+ break;
+
+ if (I == MBB.end())
+ break;
+
+ if (I->isBundled())
+ break;
+
+ if (!Orig.getOperand(1).isImm())
+ break;
+
+ // Shrink S_MOV_B64 to S_MOV_B32 when the use at the insertion point
+ // only needs a single 32-bit subreg of the defined value.
+
+ // Scan all uses of the original register from the insertion point
+ // and verify that all uses in the same live range read the same lane.
+ // Stop at a def of RegToFind since that starts a new live
+ // range whose uses won't be rewritten to our DestReg.
+ Register RegToFind = Orig.getOperand(0).getReg();
+ unsigned UseSubReg = AMDGPU::NoSubRegister;
+
+ [&]() {
+ for (auto It = I; It != MBB.end(); ++It) {
+ for (auto &MO : It->operands()) {
+ // Skip irrelevant registers
+ if (!MO.isReg() || MO.getReg() != RegToFind)
+ continue;
+
+ // Stop at a new live range
+ if (MO.isDef())
+ return;
+
+ if (UseSubReg == AMDGPU::NoSubRegister) {
+ UseSubReg = MO.getSubReg();
+ continue;
+ }
+
+ // Bail out if subregs do not match between uses
+ if (MO.getSubReg() != UseSubReg) {
+ UseSubReg = AMDGPU::NoSubRegister;
+ return;
+ }
+ }
+ }
+ }();
+
+ if (UseSubReg == AMDGPU::NoSubRegister)
+ break;
+
+ if (RI.getSubRegIdxSize(UseSubReg) != 32)
+ break;
+
+ // Determine which half of the 64-bit immediate corresponds to the use.
+ unsigned UseOffset = RI.getSubRegIdxOffset(UseSubReg);
+ unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
+ unsigned DefOffset =
+ (OrigSubReg == AMDGPU::NoSubRegister)
+ ? 0
+ : RI.getSubRegIdxOffset(Orig.getOperand(0).getSubReg());
+ int64_t Imm64 = Orig.getOperand(1).getImm();
+ int32_t Imm32 = (UseOffset == DefOffset) ? Lo_32(Imm64) : Hi_32(Imm64);
+
+ // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
+ BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
+ .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
+ .addImm(Imm32);
+ return;
+ }
+
case AMDGPU::S_LOAD_DWORDX16_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM: {
if (SubIdx != 0)
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir b/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir
new file mode 100644
index 0000000000000..6409b3d864c3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir
@@ -0,0 +1,65 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=2 -start-before=greedy -stop-after=virtregrewriter \
+# RUN: %s -o - | FileCheck %s
+
+# Test that S_MOV_B64 rematerialization is shrunk to S_MOV_B32 when only a
+# single 32-bit subreg of the defined value is used at the remat point.
+
+# CHECK-LABEL: name: remat_shrink_s_mov_b64
+---
+name: remat_shrink_s_mov_b64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8_sgpr9
+ %0:sreg_64_xexec = COPY $sgpr8_sgpr9
+
+ ; 0x0000002A_00000539: Lo_32 = 1337, Hi_32 = 42
+ undef %1.sub0_sub1:sgpr_128 = S_MOV_B64 180388627769
+ ; 0x00000007_00000065: Lo_32 = 101, Hi_32 = 7
+ undef %2.sub0_sub1:sgpr_128 = S_MOV_B64 30064771173
+ %2.sub2_sub3:sgpr_128 = S_MOV_B64 2
+ undef %3.sub2_sub3:sgpr_128 = S_MOV_B64 30064771173
+ undef %4.sub2_sub3:sgpr_128 = S_MOV_B64 180388627769
+ undef %5.sub0_sub1:sgpr_128 = S_MOV_B64 180388627769
+ %6:sreg_64 = S_MOV_B64_IMM_PSEUDO 65536
+ S_NOP 0, implicit %1.sub0
+ S_NOP 0, implicit %1.sub1
+
+ ; CHECK: S_NOP 0, implicit-def $sgpr0_sgpr1
+ S_NOP 0, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3, implicit-def $sgpr4_sgpr5, implicit-def $sgpr6_sgpr7
+
+ ; %2: uses both sub0 and sub1 - no shrink expected.
+ S_NOP 0, implicit %2.sub0
+ S_NOP 0, implicit %2.sub1
+
+ ; %1: both lanes used after barrier, rematerialized as S_MOV_B64, not shrunk.
+ ; CHECK: renamable $[[REG1LO:sgpr[0-9]+]]_[[REG1HI:sgpr[0-9]+]] = S_MOV_B64 180388627769
+ ; CHECK-NEXT: S_NOP 0, implicit renamable $[[REG1HI]]
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG1LO]]
+ S_NOP 0, implicit %1.sub1
+ S_NOP 0, implicit %1.sub0
+
+ ; %3: only sub3 used - shrink to S_MOV_B32 with Hi_32(0x0000000700000065) = 7.
+ ; CHECK: renamable $[[REG3:sgpr[0-9]+]] = S_MOV_B32 7
+ ; CHECK-NEXT: S_NOP 0, implicit renamable $[[REG3]]
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG3]]
+ S_NOP 0, implicit %3.sub3
+ S_NOP 0, implicit %3.sub3
+
+ ; %4: only sub2 used - shrink to S_MOV_B32 with Lo_32(0x0000002A00000539) = 1337.
+ ; CHECK: renamable $[[REG4:sgpr[0-9]+]] = S_MOV_B32 1337
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG4]]
+ S_NOP 0, implicit %4.sub2
+
+ ; %5: sub0_sub1 def, only sub1 used - shrink to S_MOV_B32 with Hi_32(0x0000002A00000539) = 42.
+ ; CHECK: renamable $[[REG5:sgpr[0-9]+]] = S_MOV_B32 42
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG5]]
+ S_NOP 0, implicit %5.sub1
+
+ ; CHECK: renamable $[[REG6:sgpr[0-9]+]] = S_MOV_B32 65536
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG6]]
+ S_NOP 0, implicit %6.sub0
+
+ ; CHECK: S_ENDPGM 0
+ S_ENDPGM 0, implicit %0, implicit %1.sub0
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/184333
More information about the llvm-commits
mailing list