[llvm] [AMDGPU] Emit b32 movs if (a)v_mov_b64_pseudo dest vgprs are misaligned (PR #160547)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 08:52:50 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Janek van Oirschot (JanekvO)
Changes:
#154115 exposed a possibly misaligned destination for v_mov_b64.
si-load-store-opt would emit a REG_SEQUENCE with a b64 register pair following a b32 register, resulting in a misaligned vgpr pair. machine-cp would then allow the misaligned vgpr pair to be copy-propagated into a V_MOV_B64_PSEUDO, which requires align2. This patch ensures that expansion of the b64 v_mov pseudo instruction checks for correct vgpr alignment, falling back to b32 movs when the destination pair is misaligned.
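
For context, a rough sketch of the b32 fallback that misaligned destinations now take, paraphrasing the surrounding `SIInstrInfo::expandPostRAPseudo` code (not the verbatim upstream source; assumes the enclosing context where `MI`, `MBB`, `DL`, `Dst`, `SrcOp`, and `RI` are in scope):

```cpp
// Sketch: when neither the v_mov_b64 path nor the v_pk_mov_b32 path
// fires, the 64-bit pseudo is split into two b32 movs on sub0/sub1.
// Each vgpr_32 destination carries no align2 constraint, so an
// odd-aligned pair such as v[5:6] is legal here.
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    .addImm(Lo.getSExtValue());
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    .addImm(Hi.getSExtValue());
MI.eraseFromParent();
```

This is consistent with the CHECK lines in the new tests below, where the pseudo writing the odd-aligned pair v[5:6] lowers to separate v_mov_b32_e32 instructions.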
---
Full diff: https://github.com/llvm/llvm-project/pull/160547.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+5-2)
- (added) llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir (+33)
- (added) llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir (+31)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 84886d7780888..76a1cce98c75f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2149,7 +2149,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- if (ST.hasMovB64()) {
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Dst);
+ if (ST.hasMovB64() && RI.isProperlyAlignedRC(*RC)) {
MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
@@ -2159,7 +2161,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+ RI.isProperlyAlignedRC(*RC)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
new file mode 100644
index 0000000000000..a42a74597a1e9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,33 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: "misaligned-regsequence":
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+--- |
+ define void @misaligned-regsequence() { ret void }
+...
+---
+name: misaligned-regsequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %10:vreg_64_align2 = COPY %8:sreg_64_xexec
+ %11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
+ FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
new file mode 100644
index 0000000000000..672a52a0e4bd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
@@ -0,0 +1,31 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=postrapseudos %s -o - | FileCheck %s
+
+# CHECK: v_mov_b64_misalign:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: v_mov_b64_misalign
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02
+ frame-setup CFI_INSTRUCTION undefined $pc_reg
+ renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ renamable $vgpr4 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ renamable $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1, implicit $exec
+ FLAT_STORE_DWORDX3 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5_vgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+ S_ENDPGM 0
+...
+
``````````
https://github.com/llvm/llvm-project/pull/160547
More information about the llvm-commits mailing list