[llvm] [AMDGPU] SIFoldOperands: Delay foldCopyToVGPROfScalarAddOfFrameIndex (PR #141558)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Tue May 27 00:55:22 PDT 2025
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/141558
foldCopyToVGPROfScalarAddOfFrameIndex transforms s_adds whose results are
copied to vector registers into v_adds. We don't want to do that if
foldInstOperand (which currently runs later) can fold the sreg->vreg copy
away instead.
This patch therefore delays foldCopyToVGPROfScalarAddOfFrameIndex until after
foldInstOperand.
This avoids unnecessary movs in the flat-scratch-svs.ll test and also avoids
regressions in an upcoming patch to enable ISD::PTRADD nodes.
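Concretely, in the flat-scratch-svs.ll checks updated below, SDAG output like

    v_mov_b32_e32 v2, s0
    v_mad_u32_u24 v0, v0, 2, v2

becomes

    v_mad_u32_u24 v0, v0, 2, s0

i.e., the sreg->vreg copy is folded into the VALU user instead of being
materialized as a v_mov.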
From 198e9da6a12ff40e79e7aa212fcfe8e2bd2ef408 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 27 May 2025 03:13:23 -0400
Subject: [PATCH] [AMDGPU] SIFoldOperands: Delay
foldCopyToVGPROfScalarAddOfFrameIndex
foldCopyToVGPROfScalarAddOfFrameIndex transforms s_adds whose results are
copied to vector registers into v_adds. We don't want to do that if
foldInstOperand (which currently runs later) can fold the sreg->vreg copy
away instead.
This patch therefore delays foldCopyToVGPROfScalarAddOfFrameIndex until after
foldInstOperand.
This avoids unnecessary movs in the flat-scratch-svs.ll test and also avoids
regressions in an upcoming patch to enable ISD::PTRADD nodes.
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 13 +++++++----
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 24 ++++++++------------
2 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 8b12eeba15618..a28c6e2811abe 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1909,10 +1909,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (!DstReg.isVirtual())
     return false;
 
-  if (OpToFold.isReg() &&
-      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
-    return true;
-
   // Fold copy to AGPR through reg_sequence
   // TODO: Handle with subregister extract
   if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
@@ -1947,7 +1943,14 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     Changed = true;
   }
 
-  return Changed;
+  if (Changed)
+    return true;
+
+  // Run this after foldInstOperand to avoid turning scalar additions into
+  // vector additions when the scalar result could just be folded into
+  // the user(s).
+  return (OpToFold.isReg() &&
+          foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI));
 }
 
 // Clamp patterns are canonically selected to v_max_* instructions, so only
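For reference, the tail of SIFoldOperandsImpl::tryFoldFoldableCopy after this
patch reads roughly as follows (reconstructed from the hunk above; indentation
approximate):

  if (Changed)
    return true;

  // Run this after foldInstOperand to avoid turning scalar additions into
  // vector additions when the scalar result could just be folded into
  // the user(s).
  return (OpToFold.isReg() &&
          foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI));
}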
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index bcd5d1e87954f..a98df5c97293c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -182,8 +182,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -356,8 +355,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -701,14 +699,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -884,14 +881,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1239,14 +1235,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1425,8 +1420,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v3
+; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX942-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1
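A second effect is visible in the soff2_voff2, soff2_voff4, and soff4_voff2
cases above: since v2 no longer has to hold s0 first, the constant

    v_mov_b32_e32 v2, 2

is materialized once, up front, rather than being re-emitted between the
stores.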