[llvm] [AMDGPU] Emit b32 movs if (a)v_mov_b64_pseudo dest vgprs are misaligned (PR #160547)
Janek van Oirschot via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 05:28:45 PDT 2025
https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/160547
>From b93b9551be594f9bf082dc2e4ea830ee0e5414d7 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Wed, 24 Sep 2025 16:26:08 +0100
Subject: [PATCH 1/5] [AMDGPU] Emit separate v_mov_b32s if v_mov_b64_pseudo destination vgprs are misaligned
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++--
.../AMDGPU/misaligned-vgpr-regsequence.mir | 33 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir | 31 +++++++++++++++++
3 files changed, 69 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 84886d7780888..76a1cce98c75f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2149,7 +2149,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- if (ST.hasMovB64()) {
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Dst);
+ if (ST.hasMovB64() && RI.isProperlyAlignedRC(*RC)) {
MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
@@ -2159,7 +2161,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+ RI.isProperlyAlignedRC(*RC)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
new file mode 100644
index 0000000000000..a42a74597a1e9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,33 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: "misaligned-regsequence":
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+--- |
+ define void @misaligned-regsequence() { ret void }
+...
+---
+name: misaligned-regsequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3:sgpr_64(p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %10:vreg_64_align2 = COPY %8:sreg_64_xexec
+ %11:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %13:vreg_96_align2 = REG_SEQUENCE killed %9:vgpr_32, %subreg.sub0, killed %11:vreg_64_align2, %subreg.sub1_sub2
+ FLAT_STORE_DWORDX3 %10:vreg_64_align2, killed %13:vreg_96_align2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
new file mode 100644
index 0000000000000..672a52a0e4bd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
@@ -0,0 +1,31 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=postrapseudos %s -o - | FileCheck %s
+
+# CHECK: v_mov_b64_misalign:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: v_mov_b64_misalign
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02
+ frame-setup CFI_INSTRUCTION undefined $pc_reg
+ renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ renamable $vgpr4 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ renamable $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1, implicit $exec
+ FLAT_STORE_DWORDX3 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5_vgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
+ S_ENDPGM 0
+...
+
>From 6d0f6722f4142c9b6b077477f265adce1435caf1 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Fri, 26 Sep 2025 15:51:46 +0100
Subject: [PATCH 2/5] Fix test cases, test register compatibility with regclass used
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 38 +++++++++++--------
.../AMDGPU/misaligned-vgpr-regsequence.mir | 6 +--
.../CodeGen/AMDGPU/v_mov_b64_expansion.mir | 9 +++++
llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir | 31 ---------------
4 files changed, 35 insertions(+), 49 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 76a1cce98c75f..f6e760913139c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2146,13 +2146,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ auto *TRI = MRI.getTargetRegisterInfo();
+ const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+ const TargetRegisterClass *Mov64RC =
+ getRegClass(Mov64Desc, /*OpNum=*/0, TRI);
+
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Dst);
- if (ST.hasMovB64() && RI.isProperlyAlignedRC(*RC)) {
- MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+ MI.setDesc(Mov64Desc);
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
@@ -2161,18 +2165,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
+ const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+ const TargetRegisterClass *PkMovRC =
+ getRegClass(PkMovDesc, /*OpNum=*/0, TRI);
+
if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
- RI.isProperlyAlignedRC(*RC)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0); // clamp
+ PkMovRC->contains(Dst)) {
+ BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addImm(Lo.getSExtValue())
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
index a42a74597a1e9..a968d61fee77e 100644
--- a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -1,6 +1,6 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
-# CHECK: "misaligned-regsequence":
+# CHECK: misaligned_regsequence:
# CHECK: ; %bb.0:
# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -13,10 +13,10 @@
# CHECK: s_endpgm
--- |
- define void @misaligned-regsequence() { ret void }
+ define void @misaligned_regsequence() { ret void }
...
---
-name: misaligned-regsequence
+name: misaligned_regsequence
tracksRegLiveness: true
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 70e2987454192..fffda4dd0a55f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+ bb.0:
+ $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
deleted file mode 100644
index 672a52a0e4bd3..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mov64-align.mir
+++ /dev/null
@@ -1,31 +0,0 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=postrapseudos %s -o - | FileCheck %s
-
-# CHECK: v_mov_b64_misalign:
-# CHECK: ; %bb.0:
-# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
-# CHECK: v_mov_b32_e32 v5, 0
-# CHECK: v_mov_b32_e32 v4, 0
-# CHECK: v_mov_b32_e32 v6, 0
-# CHECK: s_waitcnt lgkmcnt(0)
-# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
-# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
-# CHECK: s_endpgm
-
----
-name: v_mov_b64_misalign
-tracksRegLiveness: true
-body: |
- bb.0.entry:
- liveins: $sgpr4_sgpr5
-
- frame-setup CFI_INSTRUCTION escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02
- frame-setup CFI_INSTRUCTION undefined $pc_reg
- renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
- renamable $vgpr4 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- renamable $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
- renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1, implicit $exec
- FLAT_STORE_DWORDX3 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5_vgpr6, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4)
- S_ENDPGM 0
-...
-
>From d83de848dfbcd704249b079443c3fa24e4f01862 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Fri, 26 Sep 2025 16:54:00 +0100
Subject: [PATCH 3/5] Feedback
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
.../AMDGPU/av_movimm_pseudo_expansion.mir | 20 +++++++++++++++++++
.../CodeGen/AMDGPU/v_mov_b64_expansion.mir | 2 +-
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f6e760913139c..53f09c7baca2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2146,7 +2146,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
- MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
auto *TRI = MRI.getTargetRegisterInfo();
const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
const TargetRegisterClass *Mov64RC =
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index c52347b680371..d08185a9e0ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -208,3 +208,23 @@ body: |
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
$vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
...
+
+---
+name: av_mov_b64_misalign_vgpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
+ ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_misalign_agpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
+ ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index fffda4dd0a55f..4c68c4519302a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -100,5 +100,5 @@ body: |
name: v_mov_b64_misalign
body: |
bb.0:
- $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
...
>From 1eeadc2b111dde08e7a168c5c50f4645b4fbd962 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 29 Sep 2025 11:56:26 +0100
Subject: [PATCH 4/5] Feedback, correct tests
---
.../AMDGPU/misaligned-vgpr-regsequence.mir | 3 ---
.../siloadstoreopt-misaligned-regsequence.ll | 21 +++++++++++++++++++
.../CodeGen/AMDGPU/v_mov_b64_expansion.mir | 9 --------
3 files changed, 21 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
index a968d61fee77e..c55c8bb4c506e 100644
--- a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -12,9 +12,6 @@
# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
# CHECK: s_endpgm
---- |
- define void @misaligned_regsequence() { ret void }
-...
---
name: misaligned_regsequence
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
new file mode 100644
index 0000000000000..41266884e11e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define amdgpu_kernel void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
+; CHECK-NEXT: s_endpgm
+entry:
+ %1 = getelementptr i8, ptr %0, i64 4
+ store i32 0, ptr %0, align 4
+ store i64 0, ptr %1, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 4c68c4519302a..70e2987454192 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,12 +93,3 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...
-
-# GCN-LABEL: name: v_mov_b64_misalign
-# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
-# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
-name: v_mov_b64_misalign
-body: |
- bb.0:
- $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
-...
>From 794267516b0af5d5a859e2e613ccef7fccced106 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 30 Sep 2025 13:28:23 +0100
Subject: [PATCH 5/5] Allow V_MOV_B64_PSEUDO unaligned dst registers
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ++++--
llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir | 3 ++-
llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir | 9 +++++++++
3 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 53f09c7baca2a..9aaefa84e16ad 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5050,7 +5050,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// aligned register constraint.
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
- if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+ if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+ Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
@@ -6003,7 +6004,8 @@ SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
return nullptr;
auto RegClass = TID.operands()[OpNum].RegClass;
// Special pseudos have no alignment requirement.
- if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO || isSpill(TID))
+ if (TID.getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO ||
+ TID.getOpcode() == AMDGPU::V_MOV_B64_PSEUDO || isSpill(TID))
return RI.getRegClass(RegClass);
return adjustAllocatableRegClass(ST, RI, TID, RegClass);
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 21455a9f5074f..cd2a8de03b813 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -425,7 +425,7 @@ body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
- ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
@@ -438,6 +438,7 @@ body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 70e2987454192..4c68c4519302a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+ bb.0:
+ $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
+...
More information about the llvm-commits mailing list