[llvm] [AMDGPU] Shrink S_MOV_B64 to S_MOV_B32 during rematerialization (PR #184333)

Tue Mar 3 04:59:14 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Romanov Vlad (romanovvlad)

<details>
<summary>Changes</summary>

When rematerializing S_MOV_B64 or S_MOV_B64_IMM_PSEUDO, if only a single 32-bit subregister of the result is used at the rematerialization point, emit S_MOV_B32 with the corresponding half of the 64-bit immediate instead.

This reduces register pressure by defining a 32-bit register instead of a 64-bit pair when the other half is unused.

---
Full diff: https://github.com/llvm/llvm-project/pull/184333.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+72) 
- (added) llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir (+65) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 91d85990ce16c..c8003e4bae9a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2617,6 +2617,78 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
   // TODO: Handle more cases.
   unsigned Opcode = Orig.getOpcode();
   switch (Opcode) {
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
+    if (SubIdx != 0)
+      break;
+
+    if (I == MBB.end())
+      break;
+
+    if (I->isBundled())
+      break;
+
+    if (!Orig.getOperand(1).isImm())
+      break;
+
+    // Shrink S_MOV_B64 to S_MOV_B32 when the use at the insertion point
+    // only needs a single 32-bit subreg of the defined value.
+
+    // Scan the uses of the original register from the insertion point to the
+    // end of the block and verify that they all read the same subregister.
+    // Stop at a def of RegToFind since that starts a new live
+    // range whose uses won't be rewritten to our DestReg.
+    Register RegToFind = Orig.getOperand(0).getReg();
+    unsigned UseSubReg = AMDGPU::NoSubRegister;
+
+    [&]() {
+      for (auto It = I; It != MBB.end(); ++It) {
+        for (auto &MO : It->operands()) {
+          // Skip irrelevant registers
+          if (!MO.isReg() || MO.getReg() != RegToFind)
+            continue;
+
+          // Stop at a new live range
+          if (MO.isDef())
+            return;
+
+          if (UseSubReg == AMDGPU::NoSubRegister) {
+            UseSubReg = MO.getSubReg();
+            continue;
+          }
+
+          // Bail out if subregs do not match between uses
+          if (MO.getSubReg() != UseSubReg) {
+            UseSubReg = AMDGPU::NoSubRegister;
+            return;
+          }
+        }
+      }
+    }();
+
+    if (UseSubReg == AMDGPU::NoSubRegister)
+      break;
+
+    if (RI.getSubRegIdxSize(UseSubReg) != 32)
+      break;
+
+    // Determine which half of the 64-bit immediate corresponds to the use.
+    unsigned UseOffset = RI.getSubRegIdxOffset(UseSubReg);
+    unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
+    unsigned DefOffset =
+        (OrigSubReg == AMDGPU::NoSubRegister)
+            ? 0
+            : RI.getSubRegIdxOffset(Orig.getOperand(0).getSubReg());
+    int64_t Imm64 = Orig.getOperand(1).getImm();
+    int32_t Imm32 = (UseOffset == DefOffset) ? Lo_32(Imm64) : Hi_32(Imm64);
+
+    // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
+    BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
+        .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
+        .addImm(Imm32);
+    return;
+  }
+
   case AMDGPU::S_LOAD_DWORDX16_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM: {
     if (SubIdx != 0)
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir b/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir
new file mode 100644
index 0000000000000..6409b3d864c3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-sgpr128-partial-def.mir
@@ -0,0 +1,65 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=2 -start-before=greedy -stop-after=virtregrewriter \
+# RUN:   %s -o - | FileCheck %s
+
+# Test that S_MOV_B64 rematerialization is shrunk to S_MOV_B32 when only a
+# single 32-bit subreg of the defined value is used at the remat point.
+
+# CHECK-LABEL: name: remat_shrink_s_mov_b64
+---
+name: remat_shrink_s_mov_b64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr8_sgpr9
+    %0:sreg_64_xexec = COPY $sgpr8_sgpr9
+
+    ; 0x0000002A_00000539: Lo_32 = 1337, Hi_32 = 42
+    undef %1.sub0_sub1:sgpr_128 = S_MOV_B64 180388627769
+    ; 0x00000007_00000065: Lo_32 = 101, Hi_32 = 7
+    undef %2.sub0_sub1:sgpr_128 = S_MOV_B64 30064771173
+    %2.sub2_sub3:sgpr_128 = S_MOV_B64 2
+    undef %3.sub2_sub3:sgpr_128 = S_MOV_B64 30064771173
+    undef %4.sub2_sub3:sgpr_128 = S_MOV_B64 180388627769
+    undef %5.sub0_sub1:sgpr_128 = S_MOV_B64 180388627769
+    %6:sreg_64 = S_MOV_B64_IMM_PSEUDO 65536
+    S_NOP 0, implicit %1.sub0
+    S_NOP 0, implicit %1.sub1
+
+    ; CHECK: S_NOP 0, implicit-def $sgpr0_sgpr1
+    S_NOP 0, implicit-def $sgpr0_sgpr1, implicit-def $sgpr2_sgpr3, implicit-def $sgpr4_sgpr5, implicit-def $sgpr6_sgpr7
+
+    ; %2: uses both sub0 and sub1 - no shrink expected.
+    S_NOP 0, implicit %2.sub0
+    S_NOP 0, implicit %2.sub1
+
+    ; %1: both lanes are used after the SGPR-clobbering S_NOP, so it is rematerialized as S_MOV_B64 and not shrunk.
+    ; CHECK: renamable $[[REG1LO:sgpr[0-9]+]]_[[REG1HI:sgpr[0-9]+]] = S_MOV_B64 180388627769
+    ; CHECK-NEXT: S_NOP 0, implicit renamable $[[REG1HI]]
+    ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG1LO]]
+    S_NOP 0, implicit %1.sub1
+    S_NOP 0, implicit %1.sub0
+
+    ; %3: only sub3 used - shrink to S_MOV_B32 with Hi_32(0x0000000700000065) = 7.
+    ; CHECK: renamable $[[REG3:sgpr[0-9]+]] = S_MOV_B32 7
+    ; CHECK-NEXT: S_NOP 0, implicit renamable $[[REG3]]
+    ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG3]]
+    S_NOP 0, implicit %3.sub3
+    S_NOP 0, implicit %3.sub3
+
+    ; %4: only sub2 used - shrink to S_MOV_B32 with Lo_32(0x0000002A00000539) = 1337.
+    ; CHECK: renamable $[[REG4:sgpr[0-9]+]] = S_MOV_B32 1337
+    ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG4]]
+    S_NOP 0, implicit %4.sub2
+
+    ; %5: sub0_sub1 def, only sub1 used - shrink to S_MOV_B32 with Hi_32(0x0000002A00000539) = 42.
+    ; CHECK: renamable $[[REG5:sgpr[0-9]+]] = S_MOV_B32 42
+    ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG5]]
+    S_NOP 0, implicit %5.sub1
+
+    ; CHECK: renamable $[[REG6:sgpr[0-9]+]] = S_MOV_B32 65536
+    ; CHECK-NEXT: S_NOP 0, implicit killed renamable $[[REG6]]
+    S_NOP 0, implicit %6.sub0
+
+    ; CHECK: S_ENDPGM 0
+    S_ENDPGM 0, implicit %0, implicit %1.sub0
+...

``````````

</details>


https://github.com/llvm/llvm-project/pull/184333