[llvm] [AMDGPU] siloadstoreopt generate REG_SEQUENCE with aligned operands (PR #162088)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 6 06:43:05 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Janek van Oirschot (JanekvO)
<details>
<summary>Changes</summary>
Stop generating misaligned operands (misaligned compared to the dst alignment)
---
Full diff: https://github.com/llvm/llvm-project/pull/162088.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+27-6)
- (modified) llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f0d1117664983..1185fbf325932 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1363,6 +1363,15 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
return Where;
}
+static bool isCompatibleAlignSubReg(const TargetRegisterClass *RC, unsigned Idx,
+ const GCNSubtarget *STM,
+ const SIRegisterInfo *TRI) {
+ if (!STM->needsAlignedVGPRs() || !TRI->isVGPRClass(RC) ||
+ !TRI->isProperlyAlignedRC(*RC) || AMDGPU::getRegBitWidth(*RC) == 32)
+ return true;
+ return !(TRI->getChannelFromSubReg(Idx) & 0x1);
+}
+
// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
@@ -1411,12 +1420,24 @@ SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
- BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
- .add(*Src0)
- .addImm(SubRegIdx0)
- .add(*Src1)
- .addImm(SubRegIdx1);
-
+ // Make sure the generated REG_SEQUENCE has sensibly aligned registers.
+ if (isCompatibleAlignSubReg(Paired.DataRC, SubRegIdx1, STM, TRI)) {
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+ } else {
+ auto BMI =
+ BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0);
+ for (unsigned i = 0; i < Paired.Width; ++i) {
+ unsigned PreviousChannel = TRI->getChannelFromSubReg(SubRegIdx0);
+ BMI.addReg(Src1->getReg(), /*flags=*/0, TRI->getSubRegFromChannel(i))
+ .addImm(TRI->getSubRegFromChannel(PreviousChannel + i + 1));
+ }
+ }
return SrcReg;
}
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
index a67cf22bdd1ce..b45b69010141e 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
@@ -213,7 +213,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2
; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
@@ -230,7 +230,7 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2_sub3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]].sub0, %subreg.sub1, [[DEF2]].sub1, %subreg.sub2, [[DEF2]].sub2, %subreg.sub3
; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
``````````
</details>
https://github.com/llvm/llvm-project/pull/162088
More information about the llvm-commits
mailing list