[llvm] [AMDGPU] Allow folding of non-subregs through REG_SEQUENCE (PR #151033)
Josh Hutton via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 11:39:46 PST 2025
https://github.com/JoshHuttonCode updated https://github.com/llvm/llvm-project/pull/151033
From 331902ad946143f0dc49ba3b7d74bb3aca289790 Mon Sep 17 00:00:00 2001
From: Josh Hutton <joshhuttonemail at gmail.com>
Date: Thu, 6 Nov 2025 14:47:32 -0800
Subject: [PATCH] [AMDGPU] Allow folding of non-subregs through REG_SEQUENCE
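
Previously, foldOperand gave up whenever the use operand carried a
subregister index, except for folding SGPRs into lo16 uses. This change
additionally permits the fold when the use register is defined by a
REG_SEQUENCE, and updateOperand now propagates the folded operand's
subregister index instead of only clearing lo16 for SGPR sources.

A rough sketch of the pattern this enables (illustrative MIR, not copied
verbatim from the tests below):

  %0:sreg_32 = ...
  %1:vgpr_32 = COPY %0
  %2:vreg_64 = REG_SEQUENCE ..., %subreg.sub0, %1, %subreg.sub1
  %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec

The subregister use %2.sub1 used to block the fold; looking through the
REG_SEQUENCE shows that the value is already scalar, so %3 can become a
plain SGPR COPY of %0. This removes redundant V_READFIRSTLANE_B32s in
fold-readlane.mir and llvm.amdgcn.quadmask.ll (issue #125950).

One guard is needed: on subtargets that require aligned VGPRs, the
DS_GWS_INIT, DS_GWS_SEMA_BR, and DS_GWS_BARRIER instructions read only
32 bits of data0 while the hardware treats it as a 64-bit read, so the
fold is rejected when it would place data0 on an odd register channel.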
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 44 +-
.../fold-non-subregs-through-regsequences.mir | 390 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/fold-readlane.mir | 2 -
.../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll | 17 +
4 files changed, 444 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84984a0871dac..fdc55a4ef62e6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -730,14 +730,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
}
- // Rework once the VS_16 register class is updated to include proper
- // 16-bit SGPRs instead of 32-bit ones.
- if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
- Old.setSubReg(AMDGPU::NoSubRegister);
+ Old.setSubReg(New->getSubReg());
if (New->getReg().isPhysical()) {
Old.substPhysReg(New->getReg(), *TRI);
} else {
- Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+ Old.substVirtReg(New->getReg(), 0, *TRI);
Old.setIsUndef(New->isUndef());
}
return true;
@@ -1150,10 +1147,14 @@ void SIFoldOperandsImpl::foldOperand(
if (UseOp->isReg() && OpToFold.isReg()) {
if (UseOp->isImplicit())
return;
- // Allow folding from SGPRs to 16-bit VGPRs.
+
+ MachineInstr *SourceInstruction = MRI->getVRegDef(UseOp->getReg());
+ // Allow folding from SGPRs to 16-bit VGPRs,
+ // or folding of non-subregs through REG_SEQUENCEs.
if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
(UseOp->getSubReg() != AMDGPU::lo16 ||
- !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+ !TRI->isSGPRReg(*MRI, OpToFold.getReg())) &&
+ !SourceInstruction->isRegSequence())
return;
}
@@ -1452,6 +1453,35 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
+ // FIXME: If we properly encode the 32-bit aligned register requirement for
+ // these DS_GWS instructions, this can be removed.
+ if (!FoldingImmLike && OpToFold.isReg() && ST->needsAlignedVGPRs()) {
+ unsigned Opc = UseMI->getOpcode();
+ // Special case for DS_GWS instructions that only use 32 bits but that the
+ // hardware treats as a 64-bit read.
+ if (Opc == AMDGPU::DS_GWS_INIT || Opc == AMDGPU::DS_GWS_SEMA_BR ||
+ Opc == AMDGPU::DS_GWS_BARRIER) {
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, OpToFold.getReg());
+ assert(RC);
+
+ const auto isAlignedReg = [&OpToFold, &UseOp, &UseMI, &RC,
+ this](AMDGPU::OpName OpName) -> bool {
+ const MachineOperand *Op = TII->getNamedOperand(*UseMI, OpName);
+ if (Op != UseOp)
+ return true;
+ Register Reg = OpToFold.getReg();
+ assert(!Reg.isPhysical());
+ return TRI->getRegSizeInBits(*RC) > 32 &&
+ !(TRI->getChannelFromSubReg(OpToFold.getSubReg()) & 1) &&
+ TRI->isProperlyAlignedRC(*RC);
+ };
+
+ if (!isAlignedReg(AMDGPU::OpName::data0))
+ return;
+ }
+ }
+
// FIXME: We could try to change the instruction from 64-bit to 32-bit
// to enable more folding opportunities. The shrink operands pass
// already does this.
diff --git a/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir b/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir
new file mode 100644
index 0000000000000..94038f17950cd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-non-subregs-through-regsequences.mir
@@ -0,0 +1,390 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=si-fold-operands -o - %s | FileCheck %s
+---
+name: v_readfirstlane_b32_omitted
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_omitted
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = COPY %1
+ %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %3, %subreg.sub1
+ %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+ %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+ %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+ %8:sreg_64 = S_QUADMASK_B64 %7, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_omitted_switched_subregs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_omitted_switched_subregs
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub1, killed [[COPY1]], %subreg.sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = COPY %1
+ %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub1, killed %3, %subreg.sub0
+ %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+ %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+ %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+ %8:sreg_64 = S_QUADMASK_B64 killed %7, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+ ; CHECK: liveins: $vgpr0, $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = COPY $sgpr0
+ %2:sgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = COPY %1, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %3, %subreg.sub1
+ %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
+ %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
+ %7:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %6, %subreg.sub1
+ %8:sreg_64 = S_QUADMASK_B64 %7, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_both_vgpr
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:sgpr_32 = IMPLICIT_DEF
+ %4:sgpr_32 = IMPLICIT_DEF
+ %5:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+ %6:sgpr_32 = V_READFIRSTLANE_B32 %5.sub0, implicit $exec
+ %7:sgpr_32 = V_READFIRSTLANE_B32 %5.sub1, implicit $exec
+ %8:sreg_64 = REG_SEQUENCE %6, %subreg.sub0, %7, %subreg.sub1
+ %9:sreg_64 = S_QUADMASK_B64 %8, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_both_sgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+ %4:sreg_64 = S_QUADMASK_B64 %3, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY1]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = COPY %0
+ %3:sgpr_32 = IMPLICIT_DEF
+ %4:sgpr_32 = IMPLICIT_DEF
+ %5:sgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = COPY %1
+ %7:vreg_128 = REG_SEQUENCE killed %6, %subreg.sub1, %2, %subreg.sub0, %2, %subreg.sub2, %2, %subreg.sub3
+ %8:vreg_64 = REG_SEQUENCE killed %7.sub3, %subreg.sub1, %7.sub1, %subreg.sub0
+ %9:sgpr_32 = V_READFIRSTLANE_B32 %8.sub0, implicit $exec
+ %10:sgpr_32 = V_READFIRSTLANE_B32 %8.sub1, implicit $exec
+ %11:sreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
+ %12:sreg_64 = S_QUADMASK_B64 killed %11, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[DEF]], %subreg.sub1, killed [[COPY1]], %subreg.sub0, killed [[COPY1]], %subreg.sub2, killed [[COPY1]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sgpr_32 = COPY %0
+ %3:sgpr_128 = REG_SEQUENCE killed %1, %subreg.sub1, killed %2, %subreg.sub0, killed %2, %subreg.sub2, killed %2, %subreg.sub3
+ %4:sreg_64 = REG_SEQUENCE killed %3.sub3, %subreg.sub1, %3.sub1, %subreg.sub0
+ %5:sreg_64 = S_QUADMASK_B64 killed %4, implicit-def $scc
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_undef_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_undef_subreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub2
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:sreg_32 = IMPLICIT_DEF
+ %5:sgpr_32 = IMPLICIT_DEF
+ %6:sgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = COPY %3
+ %8:vreg_128 = REG_SEQUENCE killed %7, %subreg.sub1, %1, %subreg.sub0, %0, %subreg.sub2
+ %9:vreg_64 = REG_SEQUENCE killed %8.sub3, %subreg.sub1, %8.sub1, %subreg.sub0
+ %10:sgpr_32 = V_READFIRSTLANE_B32 %9.sub0, implicit $exec
+ %11:sgpr_32 = V_READFIRSTLANE_B32 %9.sub1, implicit $exec
+ %12:sreg_64 = REG_SEQUENCE %10, %subreg.sub0, %11, %subreg.sub1
+ %13:sreg_64 = S_QUADMASK_B64 killed %12, implicit-def $scc
+ S_ENDPGM 0
+
+...
+---
+name: fold_aligned_reg_into_required_aligned_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_aligned_reg_into_required_aligned_reg
+ ; CHECK: S_NOP 0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:av_128_with_sub1_sub2_in_vreg_64_align2 = COPY undef renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[DEF]], %subreg.sub2, [[DEF]], %subreg.sub3
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X4F16_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = V_MFMA_F32_4X4X4F16_vgprcd_e64 [[COPY]].sub1_sub2, [[COPY]].sub1_sub2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[V_MFMA_F32_4X4X4F16_vgprcd_e64_]], undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0
+ %0:vreg_128 = COPY undef renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:vreg_64_align2 = COPY %0.sub1_sub2
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vreg_128_align2 = REG_SEQUENCE %1, %subreg.sub0_sub1, %2, %subreg.sub2, %2, %subreg.sub3
+ %4:vreg_128_align2 = V_MFMA_F32_4X4X4F16_vgprcd_e64 %3.sub0_sub1, %3.sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ S_ENDPGM 0
+...
+---
+name: not_fold_into_ds_gws_align
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[DEF]], %subreg.sub1
+ ; CHECK-NEXT: BUNDLE implicit [[REG_SEQUENCE]], implicit $m0, implicit $exec {
+ ; CHECK-NEXT: DS_GWS_INIT [[REG_SEQUENCE]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE]] :: (store (s32) into custom "GWSResource")
+ ; CHECK-NEXT: S_WAITCNT 0
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY %0.sub1
+ $m0 = S_MOV_B32 0
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+ BUNDLE implicit %3, implicit $m0, implicit $exec {
+ DS_GWS_INIT %3.sub0, 0, implicit $m0, implicit $exec, implicit %3 :: (store (s32) into custom "GWSResource")
+ S_WAITCNT 0
+ }
+ S_ENDPGM 0
+...
+---
+name: fold_into_ds_gws_align
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[DEF]], %subreg.sub1
+ ; CHECK-NEXT: BUNDLE implicit [[REG_SEQUENCE]], implicit $m0, implicit $exec {
+ ; CHECK-NEXT: DS_GWS_INIT [[COPY]].sub0, 0, implicit $m0, implicit $exec, implicit [[REG_SEQUENCE]] :: (store (s32) into custom "GWSResource")
+ ; CHECK-NEXT: S_WAITCNT 0
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY %0.sub0
+ $m0 = S_MOV_B32 0
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vreg_64_align2 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1
+ BUNDLE implicit %3, implicit $m0, implicit $exec {
+ DS_GWS_INIT %3.sub0, 0, implicit $m0, implicit $exec, implicit %3 :: (store (s32) into custom "GWSResource")
+ S_WAITCNT 0
+ }
+ S_ENDPGM 0
+...
+---
+name: tied_def_folding
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tied_def_folding
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[DEF]], implicit $exec, implicit [[COPY]](tied-def 0)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = COPY %1
+ %3:vgpr_32 = V_ADD_U32_e32 undef %0, %2:vgpr_32, implicit $exec, implicit %2:vgpr_32(tied-def 0)
+ GLOBAL_STORE_DWORD_SADDR %0, %0, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ S_ENDPGM 0, implicit %3
+...
+---
+name: tied_def_subreg_folding
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tied_def_subreg_folding
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 10, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[REG_SEQUENCE]].sub1, implicit $exec, implicit [[REG_SEQUENCE]].sub1(tied-def 0)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]]
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vreg_64_align2 = V_MOV_B64_e32 10, implicit $exec
+ %2:vreg_64_align2 = COPY %1
+ %3:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %2.sub1, %subreg.sub1
+ %4:vgpr_32 = V_ADD_U32_e32 undef %0, %3.sub1, implicit $exec, implicit %3.sub1(tied-def 0)
+ GLOBAL_STORE_DWORD_SADDR %0, %0, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ S_ENDPGM 0, implicit %4
diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
index 3ac463b4fb448..e2b1857a81729 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
@@ -371,8 +371,6 @@ body: |
# GCN-NEXT: %2:vgpr_32 = COPY %0
# GCN-NEXT: %3:vgpr_32 = COPY %1
# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1
-# GCN-NEXT: %5:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec
-# GCN-NEXT: %6:sreg_32_xm0 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec
---
name: fold-copy-readfirstlane-regsequence1
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index b9bf76c1423b6..7762b09abd72e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -260,3 +260,20 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp
store i32 %sel, ptr addrspace(1) null, align 4
ret void
}
+
+; See issue #125950
+define amdgpu_ps void @issue125950_test_quadmask_half_poison_i64(i32 %in, ptr %out) {
+; GFX11-LABEL: issue125950_test_quadmask_half_poison_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: flat_store_b64 v[1:2], v[3:4]
+; GFX11-NEXT: s_endpgm
+ %v1 = insertelement <2 x i32> <i32 poison, i32 poison>, i32 %in, i32 0
+ %v2 = bitcast <2 x i32> %v1 to i64
+ %v3 = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %v2)
+ %p = inttoptr i64 %v2 to ptr addrspace(4)
+ store i64 %v3, ptr %out
+ ret void
+}