[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Add COPY_SCC_VCC combine for VCC-SGPR-VGPR pattern (PR #179352)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Feb 25 18:01:30 PST 2026
https://github.com/vangthao95 updated https://github.com/llvm/llvm-project/pull/179352
>From 725933ff89089f614ffcf2e08f2e5ee895952889 Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Mon, 2 Feb 2026 14:15:20 -0800
Subject: [PATCH 1/7] [AMDGPU][GlobalISel] Add COPY_SCC_VCC combine for
VCC-SGPR-VGPR pattern
Eliminate VCC->SGPR->VGPR bounce created by UniInVcc when the uniform boolean
result is consumed by a VALU instruction that requires the input in VGPRs.
---
.../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 60 ++++
...regbanklegalize-eliminate-copy-scc-vcc.mir | 313 ++++++++++++++++++
2 files changed, 373 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 888717f13ebe9..b9269266526e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -46,6 +46,12 @@ m_GAMDGPUReadAnyLane(const SrcTy &Src) {
return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
}
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_COPY_SCC_VCC>
+m_GAMDGPUCopySccVcc(const SrcTy &Src) {
+ return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_COPY_SCC_VCC>(Src);
+}
+
class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
@@ -136,6 +142,7 @@ class AMDGPURegBankLegalizeCombiner {
bool tryEliminateReadAnyLane(MachineInstr &Copy);
void tryCombineCopy(MachineInstr &MI);
void tryCombineS1AnyExt(MachineInstr &MI);
+ bool tryEliminateCopySccVcc(MachineInstr &MI);
};
bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
@@ -396,6 +403,57 @@ void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
llvm_unreachable("missing anyext + trunc combine");
}
+bool AMDGPURegBankLegalizeCombiner::tryEliminateCopySccVcc(MachineInstr &MI) {
+ // Eliminate VCC->SGPR->VGPR bounce for uniform boolean extensions. This is
+ // caused by UniInVcc which creates G_AMDGPU_COPY_SCC_VCC forcing the result
+ // to SGPR which in turn is forced back to VGPR by a subsequent instruction.
+ //
+ // %vcc:vcc(s1) = ...
+ // %sgpr:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC %vcc
+ // %and:sgpr(s32) = G_AND %sgpr, 1
+ // %sel:sgpr(s32) = G_SELECT %and, {-1|1}, 0
+ // %vgpr:vgpr(s32) = COPY %sel
+ // ->
+ // %vgpr:vgpr(s32) = G_SELECT %vcc, {-1|1}, 0
+
+ Register VgprDst = MI.getOperand(0).getReg();
+ Register SgprSrc = MI.getOperand(1).getReg();
+
+ if (!VgprDst.isVirtual() || !SgprSrc.isVirtual())
+ return false;
+
+ if (MRI.getRegBankOrNull(VgprDst) != VgprRB ||
+ MRI.getRegBankOrNull(SgprSrc) != SgprRB)
+ return false;
+
+ // Match: G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), SelTrueReg, 0
+ Register VccReg, SelTrueReg;
+ if (!mi_match(SgprSrc, MRI,
+ m_GISelect(m_GAnd(m_GAMDGPUCopySccVcc(m_Reg(VccReg)),
+ m_SpecificICst(1)),
+ m_Reg(SelTrueReg), m_ZeroInt())))
+ return false;
+
+ if (MRI.getRegBankOrNull(VccReg) != VccRB)
+ return false;
+
+ // SelTrueReg must be constant -1 (SEXT) or 1 (ZEXT).
+ auto SelTrueConst = getIConstantVRegValWithLookThrough(SelTrueReg, MRI);
+ if (!SelTrueConst)
+ return false;
+ int64_t SelTrueVal = SelTrueConst->Value.getSExtValue();
+ if (SelTrueVal != -1 && SelTrueVal != 1)
+ return false;
+
+ B.setInstrAndDebugLoc(MI);
+ LLT Ty = MRI.getType(VgprDst);
+ B.buildSelect(VgprDst, VccReg, B.buildConstant({VgprRB, Ty}, SelTrueVal),
+ B.buildConstant({VgprRB, Ty}, 0));
+
+ eraseInstr(MI, MRI);
+ return true;
+}
+
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
const LLT S1 = LLT::scalar(1);
@@ -518,6 +576,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : make_early_inc_range(MBB)) {
if (MI.getOpcode() == AMDGPU::COPY) {
+ if (Combiner.tryEliminateCopySccVcc(MI))
+ continue;
Combiner.tryCombineCopy(MI);
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
new file mode 100644
index 0000000000000..ad89b8ba603c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
@@ -0,0 +1,313 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
+
+---
+name: test_eliminate_sext_to_vgpr
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_sext_to_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %3:sgpr(s32) = G_SEXT %2
+ G_STORE %3, %1 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_zext_to_vgpr
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_zext_to_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %3:sgpr(s32) = G_ZEXT %2
+ G_STORE %3, %1 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_sext_to_s64_vgpr
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_sext_to_s64_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s64) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s64), [[COPY1]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %3:sgpr(s64) = G_SEXT %2
+ G_STORE %3, %1 :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_zext_to_s64_vgpr
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_zext_to_s64_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s64) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s64), [[COPY1]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %3:sgpr(s64) = G_ZEXT %2
+ G_STORE %3, %1 :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_anyext_to_vgpr
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_anyext_to_vgpr
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
+ ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_COPY_SCC_VCC]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %3:sgpr(s32) = G_ANYEXT %2
+ G_STORE %3, %1 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_fcmp_source
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_eliminate_fcmp_source
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY3]](s32), [[COPY4]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = COPY $sgpr2_sgpr3
+ %3:sgpr(s1) = G_FCMP floatpred(oeq), %0, %1
+ %4:sgpr(s32) = G_ZEXT %3
+ G_STORE %4, %2 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_eliminate_multiple_vgpr_copies
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: test_eliminate_multiple_vgpr_copies
+ ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
+ ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(p1) = COPY $sgpr2_sgpr3
+ %2:sgpr(p1) = COPY $sgpr4_sgpr5
+ %3:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %4:sgpr(s32) = G_SEXT %3
+ G_STORE %4, %1 :: (store (s32), addrspace 1)
+ G_STORE %4, %2 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_mixed_uses
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_mixed_uses
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
+ ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[COPY1]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C4]], [[C3]]
+ ; CHECK-NEXT: G_STORE [[SELECT1]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = COPY $sgpr2_sgpr3
+ %3:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %4:sgpr(s32) = G_SEXT %3
+ %5:sgpr(s32) = G_ADD %4, %1
+ G_STORE %4, %2 :: (store (s32), addrspace 1)
+ G_STORE %5, %2 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: test_no_eliminate_branch
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ ; CHECK-LABEL: name: test_no_eliminate_branch
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY1]](s32), 3
+ ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
+ ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s1) = G_IS_FPCLASS %0, 3
+ G_BRCOND %1, %bb.1
+
+ bb.1:
+ S_ENDPGM 0
+...
+
+---
+name: test_no_eliminate_scalar_arithmetic_use
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-LABEL: name: test_no_eliminate_scalar_arithmetic_use
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
+ ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[COPY1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(p1) = COPY $sgpr2_sgpr3
+ %3:sgpr(s1) = G_IS_FPCLASS %0, 3
+ %4:sgpr(s32) = G_SEXT %3
+ %5:sgpr(s32) = G_ADD %4, %1
+ G_STORE %5, %2 :: (store (s32), addrspace 1)
+ S_ENDPGM 0
+...
>From 0d9f479a20303c3c5cc85a0ffcf17440c23adcef Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Thu, 5 Feb 2026 16:12:27 -0800
Subject: [PATCH 2/7] Move combine into AMDGPURegBankCombiner
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 10 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 67 ++++
.../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 60 ----
...regbanklegalize-eliminate-copy-scc-vcc.mir | 313 ------------------
4 files changed, 76 insertions(+), 374 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 7f00eadbf3f3f..4bc9a17b520fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -101,6 +101,14 @@ def fmed3_intrinsic_to_clamp : GICombineRule<
[{ return matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
(apply [{ applyClamp(*${fmed3}, ${matchinfo}); }])>;
+def copy_scc_vcc_matchdata : GIDefMatchData<"CopySccVccMatchInfo">;
+
+def copy_scc_vcc : GICombineRule<
+ (defs root:$copy, copy_scc_vcc_matchdata:$matchinfo),
+ (match (wip_match_opcode COPY):$copy,
+ [{ return matchCopySccVcc(*${copy}, ${matchinfo}); }]),
+ (apply [{ applyCopySccVcc(*${copy}, ${matchinfo}); }])>;
+
def remove_fcanonicalize : GICombineRule<
(defs root:$fcanonicalize, register_matchinfo:$matchinfo),
(match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
@@ -239,5 +247,5 @@ def AMDGPURegBankCombiner : GICombiner<
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
- d16_load]> {
+ d16_load, copy_scc_vcc]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e2e84ce2e6219..4f47b5a2a8488 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,6 +74,11 @@ class AMDGPURegBankCombinerImpl : public Combiner {
Register Val0, Val1, Val2;
};
+ struct CopySccVccMatchInfo {
+ Register VccReg;
+ int64_t TrueVal; // -1 for SEXT, 1 for ZEXT
+ };
+
MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
template <class m_Cst, typename CstTy>
@@ -93,6 +98,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
MachineInstr *SmallLoad, Register ToOverwriteD16) const;
+ bool matchCopySccVcc(MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const;
+ void applyCopySccVcc(MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -478,6 +486,65 @@ bool AMDGPURegBankCombinerImpl::applyD16Load(
return true;
}
+// Eliminate VCC->SGPR->VGPR register bounce for uniform boolean extensions.
+// Match: COPY (G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), {-1|1}, 0)
+// Replace with: G_SELECT %vcc, {-1|1}, 0
+bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
+ MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
+ assert(MI.getOpcode() == AMDGPU::COPY);
+
+ Register VgprDst = MI.getOperand(0).getReg();
+ Register SgprSrc = MI.getOperand(1).getReg();
+
+ if (!VgprDst.isVirtual() || !SgprSrc.isVirtual())
+ return false;
+
+ if (!isVgprRegBank(VgprDst))
+ return false;
+
+ // Match: G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), TrueReg, 0
+ MachineInstr *CopySccVcc;
+ Register TrueReg;
+ if (!mi_match(SgprSrc, MRI,
+ m_GISelect(m_GAnd(m_MInstr(CopySccVcc), m_SpecificICst(1)),
+ m_Reg(TrueReg), m_ZeroInt())))
+ return false;
+
+ if (CopySccVcc->getOpcode() != AMDGPU::G_AMDGPU_COPY_SCC_VCC)
+ return false;
+
+ Register VccReg = CopySccVcc->getOperand(1).getReg();
+
+ // TrueReg must be constant -1 (SEXT) or 1 (ZEXT)
+ auto TrueConst = getIConstantVRegValWithLookThrough(TrueReg, MRI);
+ if (!TrueConst)
+ return false;
+
+ int64_t TrueVal = TrueConst->Value.getSExtValue();
+ if (TrueVal != -1 && TrueVal != 1)
+ return false;
+
+ MatchInfo.VccReg = VccReg;
+ MatchInfo.TrueVal = TrueVal;
+ return true;
+}
+
+void AMDGPURegBankCombinerImpl::applyCopySccVcc(
+ MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
+ Register VgprDst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(VgprDst);
+ const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID);
+
+ auto TrueVal = B.buildConstant(Ty, MatchInfo.TrueVal);
+ MRI.setRegBank(TrueVal.getReg(0), VgprRB);
+
+ auto FalseVal = B.buildConstant(Ty, 0);
+ MRI.setRegBank(FalseVal.getReg(0), VgprRB);
+
+ B.buildSelect(VgprDst, MatchInfo.VccReg, TrueVal, FalseVal);
+ MI.eraseFromParent();
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index b9269266526e3..888717f13ebe9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -46,12 +46,6 @@ m_GAMDGPUReadAnyLane(const SrcTy &Src) {
return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src);
}
-template <typename SrcTy>
-inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_COPY_SCC_VCC>
-m_GAMDGPUCopySccVcc(const SrcTy &Src) {
- return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_COPY_SCC_VCC>(Src);
-}
-
class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
@@ -142,7 +136,6 @@ class AMDGPURegBankLegalizeCombiner {
bool tryEliminateReadAnyLane(MachineInstr &Copy);
void tryCombineCopy(MachineInstr &MI);
void tryCombineS1AnyExt(MachineInstr &MI);
- bool tryEliminateCopySccVcc(MachineInstr &MI);
};
bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
@@ -403,57 +396,6 @@ void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
llvm_unreachable("missing anyext + trunc combine");
}
-bool AMDGPURegBankLegalizeCombiner::tryEliminateCopySccVcc(MachineInstr &MI) {
- // Eliminate VCC->SGPR->VGPR bounce for uniform boolean extensions. This is
- // caused by UniInVcc which creates G_AMDGPU_COPY_SCC_VCC forcing the result
- // to SGPR which in turn is forced back to VGPR by a subsequent instruction.
- //
- // %vcc:vcc(s1) = ...
- // %sgpr:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC %vcc
- // %and:sgpr(s32) = G_AND %sgpr, 1
- // %sel:sgpr(s32) = G_SELECT %and, {-1|1}, 0
- // %vgpr:vgpr(s32) = COPY %sel
- // ->
- // %vgpr:vgpr(s32) = G_SELECT %vcc, {-1|1}, 0
-
- Register VgprDst = MI.getOperand(0).getReg();
- Register SgprSrc = MI.getOperand(1).getReg();
-
- if (!VgprDst.isVirtual() || !SgprSrc.isVirtual())
- return false;
-
- if (MRI.getRegBankOrNull(VgprDst) != VgprRB ||
- MRI.getRegBankOrNull(SgprSrc) != SgprRB)
- return false;
-
- // Match: G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), SelTrueReg, 0
- Register VccReg, SelTrueReg;
- if (!mi_match(SgprSrc, MRI,
- m_GISelect(m_GAnd(m_GAMDGPUCopySccVcc(m_Reg(VccReg)),
- m_SpecificICst(1)),
- m_Reg(SelTrueReg), m_ZeroInt())))
- return false;
-
- if (MRI.getRegBankOrNull(VccReg) != VccRB)
- return false;
-
- // SelTrueReg must be constant -1 (SEXT) or 1 (ZEXT).
- auto SelTrueConst = getIConstantVRegValWithLookThrough(SelTrueReg, MRI);
- if (!SelTrueConst)
- return false;
- int64_t SelTrueVal = SelTrueConst->Value.getSExtValue();
- if (SelTrueVal != -1 && SelTrueVal != 1)
- return false;
-
- B.setInstrAndDebugLoc(MI);
- LLT Ty = MRI.getType(VgprDst);
- B.buildSelect(VgprDst, VccReg, B.buildConstant({VgprRB, Ty}, SelTrueVal),
- B.buildConstant({VgprRB, Ty}, 0));
-
- eraseInstr(MI, MRI);
- return true;
-}
-
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
const LLT S1 = LLT::scalar(1);
@@ -576,8 +518,6 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : make_early_inc_range(MBB)) {
if (MI.getOpcode() == AMDGPU::COPY) {
- if (Combiner.tryEliminateCopySccVcc(MI))
- continue;
Combiner.tryCombineCopy(MI);
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
deleted file mode 100644
index ad89b8ba603c8..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbanklegalize-eliminate-copy-scc-vcc.mir
+++ /dev/null
@@ -1,313 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
-
----
-name: test_eliminate_sext_to_vgpr
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_sext_to_vgpr
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(s1) = G_IS_FPCLASS %0, 3
- %3:sgpr(s32) = G_SEXT %2
- G_STORE %3, %1 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_zext_to_vgpr
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_zext_to_vgpr
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(s1) = G_IS_FPCLASS %0, 3
- %3:sgpr(s32) = G_ZEXT %2
- G_STORE %3, %1 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_sext_to_s64_vgpr
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_sext_to_s64_vgpr
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 -1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s64) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s64), [[COPY1]](p1) :: (store (s64), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(s1) = G_IS_FPCLASS %0, 3
- %3:sgpr(s64) = G_SEXT %2
- G_STORE %3, %1 :: (store (s64), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_zext_to_s64_vgpr
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_zext_to_s64_vgpr
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s64) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s64), [[COPY1]](p1) :: (store (s64), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(s1) = G_IS_FPCLASS %0, 3
- %3:sgpr(s64) = G_ZEXT %2
- G_STORE %3, %1 :: (store (s64), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_anyext_to_vgpr
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_anyext_to_vgpr
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY2]](s32), 3
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_COPY_SCC_VCC]](s32)
- ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(s1) = G_IS_FPCLASS %0, 3
- %3:sgpr(s32) = G_ANYEXT %2
- G_STORE %3, %1 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_fcmp_source
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_eliminate_fcmp_source
- ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY3]](s32), [[COPY4]]
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(s32) = COPY $sgpr1
- %2:sgpr(p1) = COPY $sgpr2_sgpr3
- %3:sgpr(s1) = G_FCMP floatpred(oeq), %0, %1
- %4:sgpr(s32) = G_ZEXT %3
- G_STORE %4, %2 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_eliminate_multiple_vgpr_copies
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr2_sgpr3, $sgpr4_sgpr5
- ; CHECK-LABEL: name: test_eliminate_multiple_vgpr_copies
- ; CHECK: liveins: $sgpr0, $sgpr2_sgpr3, $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
- ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C1]], [[C]]
- ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[COPY1]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(p1) = COPY $sgpr2_sgpr3
- %2:sgpr(p1) = COPY $sgpr4_sgpr5
- %3:sgpr(s1) = G_IS_FPCLASS %0, 3
- %4:sgpr(s32) = G_SEXT %3
- G_STORE %4, %1 :: (store (s32), addrspace 1)
- G_STORE %4, %2 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_mixed_uses
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_mixed_uses
- ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[COPY1]]
- ; CHECK-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C4]], [[C3]]
- ; CHECK-NEXT: G_STORE [[SELECT1]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(s32) = COPY $sgpr1
- %2:sgpr(p1) = COPY $sgpr2_sgpr3
- %3:sgpr(s1) = G_IS_FPCLASS %0, 3
- %4:sgpr(s32) = G_SEXT %3
- %5:sgpr(s32) = G_ADD %4, %1
- G_STORE %4, %2 :: (store (s32), addrspace 1)
- G_STORE %5, %2 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
-
----
-name: test_no_eliminate_branch
-legalized: true
-tracksRegLiveness: true
-
-body: |
- ; CHECK-LABEL: name: test_no_eliminate_branch
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY1]](s32), 3
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
- ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_ENDPGM 0
- bb.0:
- liveins: $sgpr0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(s1) = G_IS_FPCLASS %0, 3
- G_BRCOND %1, %bb.1
-
- bb.1:
- S_ENDPGM 0
-...
-
----
-name: test_no_eliminate_scalar_arithmetic_use
-legalized: true
-tracksRegLiveness: true
-
-body: |
- bb.0:
- liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-LABEL: name: test_no_eliminate_scalar_arithmetic_use
- ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2_sgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY3]](s32), 3
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[COPY1]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p1) :: (store (s32), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:sgpr(s32) = COPY $sgpr0
- %1:sgpr(s32) = COPY $sgpr1
- %2:sgpr(p1) = COPY $sgpr2_sgpr3
- %3:sgpr(s1) = G_IS_FPCLASS %0, 3
- %4:sgpr(s32) = G_SEXT %3
- %5:sgpr(s32) = G_ADD %4, %1
- G_STORE %5, %2 :: (store (s32), addrspace 1)
- S_ENDPGM 0
-...
>From fe7d2980032179ff1a5ad22dceb4249747f95a11 Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Fri, 6 Feb 2026 23:34:34 -0800
Subject: [PATCH 3/7] Adjust combine pattern.
---
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 60 +++++++--------
.../GlobalISel/regbankcombiner-smed3.mir | 76 +++++++------------
.../GlobalISel/regbankcombiner-umed3.mir | 76 +++++++------------
llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll | 24 +++---
llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll | 68 ++++++++---------
5 files changed, 132 insertions(+), 172 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 4f47b5a2a8488..0829867262218 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -76,7 +76,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
struct CopySccVccMatchInfo {
Register VccReg;
- int64_t TrueVal; // -1 for SEXT, 1 for ZEXT
+ Register TrueReg;
+ Register FalseReg;
};
MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
@@ -145,6 +146,17 @@ Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
if (isVgprRegBank(Reg))
return Reg;
+ const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID);
+
+ // Build constants directly in VGPR instead of copying from SGPR.
+ std::optional<ValueAndVReg> Val =
+ getIConstantVRegValWithLookThrough(Reg, MRI);
+ if (Val) {
+ auto VgprCst = B.buildConstant(MRI.getType(Reg), Val->Value);
+ MRI.setRegBank(VgprCst.getReg(0), VgprRB);
+ return VgprCst.getReg(0);
+ }
+
// Search for existing copy of Reg to vgpr.
for (MachineInstr &Use : MRI.use_instructions(Reg)) {
Register Def = Use.getOperand(0).getReg();
@@ -154,7 +166,7 @@ Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
// Copy Reg to vgpr.
Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
- MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID));
+ MRI.setRegBank(VgprReg, VgprRB);
return VgprReg;
}
@@ -486,9 +498,9 @@ bool AMDGPURegBankCombinerImpl::applyD16Load(
return true;
}
-// Eliminate VCC->SGPR->VGPR register bounce for uniform boolean extensions.
-// Match: COPY (G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), {-1|1}, 0)
-// Replace with: G_SELECT %vcc, {-1|1}, 0
+// Eliminate VCC->SGPR->VGPR register bounce for uniform boolean in VCC.
+// Match: COPY (G_SELECT (G_AMDGPU_COPY_SCC_VCC %vcc), %true, %false)
+// Replace with: G_SELECT %vcc, %vgpr_true, %vgpr_false
bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
assert(MI.getOpcode() == AMDGPU::COPY);
@@ -502,46 +514,30 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
if (!isVgprRegBank(VgprDst))
return false;
- // Match: G_SELECT (G_AND (G_AMDGPU_COPY_SCC_VCC %vcc), 1), TrueReg, 0
- MachineInstr *CopySccVcc;
- Register TrueReg;
+ MachineInstr *CondDef;
+ Register TrueReg, FalseReg;
if (!mi_match(SgprSrc, MRI,
- m_GISelect(m_GAnd(m_MInstr(CopySccVcc), m_SpecificICst(1)),
- m_Reg(TrueReg), m_ZeroInt())))
+ m_GISelect(m_MInstr(CondDef), m_Reg(TrueReg),
+ m_Reg(FalseReg))))
return false;
- if (CopySccVcc->getOpcode() != AMDGPU::G_AMDGPU_COPY_SCC_VCC)
+ if (CondDef->getOpcode() != AMDGPU::G_AMDGPU_COPY_SCC_VCC)
return false;
- Register VccReg = CopySccVcc->getOperand(1).getReg();
-
- // TrueReg must be constant -1 (SEXT) or 1 (ZEXT)
- auto TrueConst = getIConstantVRegValWithLookThrough(TrueReg, MRI);
- if (!TrueConst)
- return false;
-
- int64_t TrueVal = TrueConst->Value.getSExtValue();
- if (TrueVal != -1 && TrueVal != 1)
- return false;
-
- MatchInfo.VccReg = VccReg;
- MatchInfo.TrueVal = TrueVal;
+ MatchInfo.VccReg = CondDef->getOperand(1).getReg();
+ MatchInfo.TrueReg = TrueReg;
+ MatchInfo.FalseReg = FalseReg;
return true;
}
void AMDGPURegBankCombinerImpl::applyCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
Register VgprDst = MI.getOperand(0).getReg();
- LLT Ty = MRI.getType(VgprDst);
- const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID);
-
- auto TrueVal = B.buildConstant(Ty, MatchInfo.TrueVal);
- MRI.setRegBank(TrueVal.getReg(0), VgprRB);
- auto FalseVal = B.buildConstant(Ty, 0);
- MRI.setRegBank(FalseVal.getReg(0), VgprRB);
+ Register VgprTrue = getAsVgpr(MatchInfo.TrueReg);
+ Register VgprFalse = getAsVgpr(MatchInfo.FalseReg);
- B.buildSelect(VgprDst, MatchInfo.VccReg, TrueVal, FalseVal);
+ B.buildSelect(VgprDst, MatchInfo.VccReg, VgprTrue, VgprFalse);
MI.eraseFromParent();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
index f18a576b56250..197d586e1502e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir
@@ -15,11 +15,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -46,11 +44,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -77,11 +73,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -108,11 +102,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -139,11 +131,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -170,11 +160,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -201,11 +189,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -232,11 +218,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -304,8 +288,8 @@ body: |
; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
; CHECK-NEXT: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SMAX]], [[C1]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[SMIN]](s32)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%0:sgpr(s32) = COPY $sgpr2
%3:sgpr(s32) = G_CONSTANT i32 -12
@@ -331,11 +315,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65
+ ; CHECK-NEXT: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
index ef05a5274462a..ec0b78101f577 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir
@@ -15,11 +15,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -46,11 +44,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -77,11 +73,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -108,11 +102,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -139,11 +131,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -170,11 +160,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -201,11 +189,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -232,11 +218,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 17
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
@@ -304,8 +288,8 @@ body: |
; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17
; CHECK-NEXT: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[UMAX]], [[C1]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UMIN]](s32)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%0:sgpr(s32) = COPY $sgpr2
%3:sgpr(s32) = G_CONSTANT i32 12
@@ -332,11 +316,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65
+ ; CHECK-NEXT: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
index 20ac23daebba2..0a8f19a452fef 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll
@@ -84,8 +84,8 @@ define i8 @test_signed_i8_f32(float %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffffff80
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7f
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i8 @llvm.fptosi.sat.i8.f32(float %f)
ret i8 %x
@@ -130,8 +130,8 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7fff
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i16 @llvm.fptosi.sat.i16.f32(float %f)
ret i16 %x
@@ -367,8 +367,8 @@ define i8 @test_signed_i8_f64(double %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffffff80
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7f
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i8 @llvm.fptosi.sat.i8.f64(double %f)
ret i8 %x
@@ -413,8 +413,8 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7fff
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i16 @llvm.fptosi.sat.i16.f64(double %f)
ret i16 %x
@@ -697,9 +697,9 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.l
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffffff80
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7f
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i8 @llvm.fptosi.sat.i8.f16(half %f)
ret i8 %x
@@ -770,9 +770,9 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
; GFX12-GI-NEXT: s_wait_bvhcnt 0x0
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.l
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffff8000
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7fff
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call i16 @llvm.fptosi.sat.i16.f16(half %f)
ret i16 %x
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
index 1bfc5798f15c9..5119d65025220 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll
@@ -657,9 +657,9 @@ define <2 x i8> @test_signed_v2f64_v2i8(<2 x double> %f) {
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX12-GI-NEXT: v_mov_b32_e32 v2, 0xffffff80
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v2, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v1, v1, v2, 0x7f
+; GFX12-GI-NEXT: v_mov_b32_e32 v2, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v0, v2
+; GFX12-GI-NEXT: v_med3_i32 v1, 0xffffff80, v1, v2
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x i8> @llvm.fptosi.sat.v2f64.v2i8(<2 x double> %f)
ret <2 x i8> %x
@@ -742,9 +742,9 @@ define <2 x i16> @test_signed_v2f64_v2i16(<2 x double> %f) {
; GFX12-GI-NEXT: s_wait_kmcnt 0x0
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v2, v[2:3]
; GFX12-GI-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX12-GI-NEXT: v_med3_i32 v2, v2, v1, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v1, 0x7fff
+; GFX12-GI-NEXT: v_mov_b32_e32 v1, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v2, 0xffff8000, v2, v1
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v0, v1
; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x i16> @llvm.fptosi.sat.v2f64.v2i16(<2 x double> %f)
@@ -1252,15 +1252,15 @@ define <4 x i8> @test_signed_v4f16_v4i8(<4 x half> %f) {
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v3, v1.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v1, v1.h
-; GFX12-GI-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX12-GI-NEXT: v_mov_b32_e32 v4, 0x7f
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v5, v0
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v6, v1
-; GFX12-GI-NEXT: v_med3_i32 v0, v2, v4, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v1, v5, v4, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v2, v3, v4, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v3, v6, v4, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v2, v4
+; GFX12-GI-NEXT: v_med3_i32 v1, 0xffffff80, v5, v4
+; GFX12-GI-NEXT: v_med3_i32 v2, 0xffffff80, v3, v4
+; GFX12-GI-NEXT: v_med3_i32 v3, 0xffffff80, v6, v4
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call <4 x i8> @llvm.fptosi.sat.v4f16.v4i8(<4 x half> %f)
ret <4 x i8> %x
@@ -1394,15 +1394,15 @@ define <4 x i16> @test_signed_v4f16_v4i16(<4 x half> %f) {
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v3, v1.h
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v1, v1.l
-; GFX12-GI-NEXT: v_mov_b32_e32 v4, 0xffff8000
+; GFX12-GI-NEXT: v_mov_b32_e32 v4, 0x7fff
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX12-GI-NEXT: v_med3_i32 v2, v2, v4, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v3, v3, v4, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v0, v0, v4, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v1, v1, v4, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v2, 0xffff8000, v2, v4
+; GFX12-GI-NEXT: v_med3_i32 v3, 0xffff8000, v3, v4
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX12-GI-NEXT: v_med3_i32 v1, 0xffff8000, v1, v4
; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
@@ -2132,29 +2132,29 @@ define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v4, v0.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.h
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v5, v1.l
-; GFX12-GI-NEXT: v_mov_b32_e32 v7, 0xffffff80
+; GFX12-GI-NEXT: v_mov_b32_e32 v7, 0x7f
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v9, v1.h
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v6, v0
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v5, v5
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v10, v2.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v2, v2.h
-; GFX12-GI-NEXT: v_med3_i32 v0, v4, v7, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffffff80, v4, v7
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v4, v3.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v3, v3.h
-; GFX12-GI-NEXT: v_med3_i32 v1, v6, v7, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v8, v5, v7, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v1, 0xffffff80, v6, v7
+; GFX12-GI-NEXT: v_med3_i32 v8, 0xffffff80, v5, v7
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v5, v9
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v6, v10
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v9, v4
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v10, v3
-; GFX12-GI-NEXT: v_med3_i32 v3, v5, v7, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v5, v2, v7, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v3, 0xffffff80, v5, v7
+; GFX12-GI-NEXT: v_med3_i32 v5, 0xffffff80, v2, v7
; GFX12-GI-NEXT: v_mov_b32_e32 v2, v8
-; GFX12-GI-NEXT: v_med3_i32 v4, v6, v7, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v6, v9, v7, 0x7f
-; GFX12-GI-NEXT: v_med3_i32 v7, v10, v7, 0x7f
+; GFX12-GI-NEXT: v_med3_i32 v4, 0xffffff80, v6, v7
+; GFX12-GI-NEXT: v_med3_i32 v6, 0xffffff80, v9, v7
+; GFX12-GI-NEXT: v_med3_i32 v7, 0xffffff80, v10, v7
; GFX12-GI-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x i8> @llvm.fptosi.sat.v8f16.v8i8(<8 x half> %f)
ret <8 x i8> %x
@@ -2363,28 +2363,28 @@ define <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v5, v1.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v4, v0.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v0, v0.h
-; GFX12-GI-NEXT: v_mov_b32_e32 v6, 0xffff8000
+; GFX12-GI-NEXT: v_mov_b32_e32 v6, 0x7fff
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v8, v1.h
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v5, v5
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v7, v0
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v9, v2.l
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v2, v2.h
-; GFX12-GI-NEXT: v_med3_i32 v1, v5, v6, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v1, 0xffff8000, v5, v6
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v5, v3.h
; GFX12-GI-NEXT: v_cvt_f32_f16_e32 v3, v3.l
-; GFX12-GI-NEXT: v_med3_i32 v0, v4, v6, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v4, v7, v6, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v0, 0xffff8000, v4, v6
+; GFX12-GI-NEXT: v_med3_i32 v4, 0xffff8000, v7, v6
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v7, v8
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v8, v9
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v9, v2
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v5, v5
; GFX12-GI-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX12-GI-NEXT: v_med3_i32 v7, v7, v6, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v2, v8, v6, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v8, v9, v6, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v5, v5, v6, 0x7fff
-; GFX12-GI-NEXT: v_med3_i32 v3, v3, v6, 0x7fff
+; GFX12-GI-NEXT: v_med3_i32 v7, 0xffff8000, v7, v6
+; GFX12-GI-NEXT: v_med3_i32 v2, 0xffff8000, v8, v6
+; GFX12-GI-NEXT: v_med3_i32 v8, 0xffff8000, v9, v6
+; GFX12-GI-NEXT: v_med3_i32 v5, 0xffff8000, v5, v6
+; GFX12-GI-NEXT: v_med3_i32 v3, 0xffff8000, v3, v6
; GFX12-GI-NEXT: v_mov_b16_e32 v0.h, v4.l
; GFX12-GI-NEXT: v_mov_b16_e32 v1.h, v7.l
; GFX12-GI-NEXT: v_mov_b16_e32 v2.h, v8.l
>From 662c0dc103e489c93fad22bfba48e8a56d40d37f Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Fri, 20 Feb 2026 19:55:05 -0800
Subject: [PATCH 4/7] Remove wip_match_opcode, add TODO for regression
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 1 +
.../GlobalISel/inst-select-copy-scc-vcc.ll | 30 +-
.../AMDGPU/GlobalISel/regbankselect-mui.ll | 5 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 10 +-
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 17 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 18 +-
llvm/test/CodeGen/AMDGPU/fminimum.ll | 18 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 121 +++-----
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 268 ++++++------------
10 files changed, 176 insertions(+), 314 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 4bc9a17b520fe..6c50eec6ed486 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -105,7 +105,7 @@ def copy_scc_vcc_matchdata : GIDefMatchData<"CopySccVccMatchInfo">;
def copy_scc_vcc : GICombineRule<
(defs root:$copy, copy_scc_vcc_matchdata:$matchinfo),
- (match (wip_match_opcode COPY):$copy,
+ (match (COPY $dst, $src):$copy,
[{ return matchCopySccVcc(*${copy}, ${matchinfo}); }]),
(apply [{ applyCopySccVcc(*${copy}, ${matchinfo}); }])>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 0829867262218..d5efb17e45afa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -505,6 +505,7 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
assert(MI.getOpcode() == AMDGPU::COPY);
+ // TODO: Add a heuristic to determine whether the combine is profitable.
Register VgprDst = MI.getOperand(0).getReg();
Register SgprSrc = MI.getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
index 315b02edea075..2b1c9c68372db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -6,15 +6,15 @@
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
; GFX7-LABEL: fcmp_uniform_select:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9
-; GFX7-NEXT: s_load_dword s3, s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
-; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_cmp_eq_f32_e64 s[4:5], s6, 0
-; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
-; GFX7-NEXT: s_cselect_b32 s3, s7, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_cmp_eq_f32_e64 vcc, s2, 0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -25,11 +25,11 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8-NEXT: s_cselect_b32 s0, s1, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_cmp_eq_f32_e64 vcc, s0, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -37,16 +37,14 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add
; GFX11-LABEL: fcmp_uniform_select:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, s1, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s1, s0
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
%cmp = fcmp oeq float %a, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index fa280a852383b..3ddc94ac9308c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -209,10 +209,9 @@ define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, pt
;
; NEW_RBS-LABEL: vcc_to_scc:
; NEW_RBS: ; %bb.0:
+; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2
; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
-; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0
-; NEW_RBS-NEXT: s_cselect_b32 s0, s1, s2
-; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
+; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%vcc_to_scc = fcmp oeq float %a, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 6b0fa2f947e33..17b966d8dce07 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5402,15 +5402,15 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s4, 1, 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: s_cselect_b32 s0, 0, s4
-; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 39960760a5961..cbdad6f32e2ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -5419,25 +5419,26 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0
; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc
; GFX6-NEXT: v_mov_b32_e32 v6, s2
-; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: v_mov_b32_e32 v7, s3
; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s4, 1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: s_cselect_b32 s0, s4, s5
-; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 471829186841d..fa8dee971a8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1049,17 +1049,17 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v1, v2
+; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX9-GISEL-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 0x7fc00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1126,6 +1126,7 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1135,11 +1136,10 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-GISEL-NEXT: v_max_f16_e32 v2, s2, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v1
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-GISEL-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 0x7e00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index e851f1d2e586e..34d78ee93c974 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1049,17 +1049,17 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_min_f32_e32 v3, v1, v2
+; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX9-GISEL-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 0x7fc00000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -1126,6 +1126,7 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7e00
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1135,11 +1136,10 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-GISEL-NEXT: v_min_f16_e32 v2, s2, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v1
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-GISEL-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 0x7e00
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 76958e63d36c9..4cb38aaeae2c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -45,81 +45,40 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
;
-; GFX8SELDAG-LABEL: sgpr_isnan_f16:
-; GFX8SELDAG: ; %bb.0:
-; GFX8SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8SELDAG-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2
-; GFX8SELDAG-NEXT: s_endpgm
-;
-; GFX8GLISEL-LABEL: sgpr_isnan_f16:
-; GFX8GLISEL: ; %bb.0:
-; GFX8GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GLISEL-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GLISEL-NEXT: s_endpgm
-;
-; GFX9SELDAG-LABEL: sgpr_isnan_f16:
-; GFX9SELDAG: ; %bb.0:
-; GFX9SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9SELDAG-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX9SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9SELDAG-NEXT: s_endpgm
-;
-; GFX9GLISEL-LABEL: sgpr_isnan_f16:
-; GFX9GLISEL: ; %bb.0:
-; GFX9GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GLISEL-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX9GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GLISEL-NEXT: s_endpgm
-;
-; GFX10SELDAG-LABEL: sgpr_isnan_f16:
-; GFX10SELDAG: ; %bb.0:
-; GFX10SELDAG-NEXT: s_clause 0x1
-; GFX10SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10SELDAG-NEXT: s_endpgm
-;
-; GFX10GLISEL-LABEL: sgpr_isnan_f16:
-; GFX10GLISEL: ; %bb.0:
-; GFX10GLISEL-NEXT: s_clause 0x1
-; GFX10GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GLISEL-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX10GLISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX10GLISEL-NEXT: s_endpgm
+; GFX8CHECK-LABEL: sgpr_isnan_f16:
+; GFX8CHECK: ; %bb.0:
+; GFX8CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT: s_endpgm
+;
+; GFX9CHECK-LABEL: sgpr_isnan_f16:
+; GFX9CHECK: ; %bb.0:
+; GFX9CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: s_endpgm
+;
+; GFX10CHECK-LABEL: sgpr_isnan_f16:
+; GFX10CHECK: ; %bb.0:
+; GFX10CHECK-NEXT: s_clause 0x1
+; GFX10CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: s_endpgm
;
; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16:
; GFX11SELDAG-TRUE16: ; %bb.0:
@@ -153,9 +112,7 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
; GFX11GLISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e32 vcc_lo, s2, v0.l
-; GFX11GLISEL-TRUE16-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; GFX11GLISEL-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11GLISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11GLISEL-TRUE16-NEXT: s_endpgm
;
@@ -164,13 +121,11 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX11GLISEL-FAKE16-NEXT: s_clause 0x1
; GFX11GLISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11GLISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11GLISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11GLISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11GLISEL-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
-; GFX11GLISEL-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11GLISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11GLISEL-FAKE16-NEXT: s_endpgm
%result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
%sext = sext i1 %result to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index fd7424ce05af4..639d2c136f128 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -33,116 +33,58 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX7GLISEL-NEXT: s_mov_b32 s2, -1
; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX7GLISEL-NEXT: v_cmp_class_f32_e64 s[4:5], s3, 3
-; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
-; GFX7GLISEL-NEXT: s_cselect_b32 s3, -1, 0
-; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
;
-; GFX8SELDAG-LABEL: sgpr_isnan_f32:
-; GFX8SELDAG: ; %bb.0:
-; GFX8SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8SELDAG-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2
-; GFX8SELDAG-NEXT: s_endpgm
-;
-; GFX8GLISEL-LABEL: sgpr_isnan_f32:
-; GFX8GLISEL: ; %bb.0:
-; GFX8GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX8GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GLISEL-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GLISEL-NEXT: s_endpgm
-;
-; GFX9SELDAG-LABEL: sgpr_isnan_f32:
-; GFX9SELDAG: ; %bb.0:
-; GFX9SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9SELDAG-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX9SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9SELDAG-NEXT: s_endpgm
-;
-; GFX9GLISEL-LABEL: sgpr_isnan_f32:
-; GFX9GLISEL: ; %bb.0:
-; GFX9GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX9GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GLISEL-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX9GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GLISEL-NEXT: s_endpgm
-;
-; GFX10SELDAG-LABEL: sgpr_isnan_f32:
-; GFX10SELDAG: ; %bb.0:
-; GFX10SELDAG-NEXT: s_clause 0x1
-; GFX10SELDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10SELDAG-NEXT: s_endpgm
-;
-; GFX10GLISEL-LABEL: sgpr_isnan_f32:
-; GFX10GLISEL: ; %bb.0:
-; GFX10GLISEL-NEXT: s_clause 0x1
-; GFX10GLISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10GLISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GLISEL-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX10GLISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX10GLISEL-NEXT: s_endpgm
-;
-; GFX11SELDAG-LABEL: sgpr_isnan_f32:
-; GFX11SELDAG: ; %bb.0:
-; GFX11SELDAG-NEXT: s_clause 0x1
-; GFX11SELDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11SELDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11SELDAG-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX11SELDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11SELDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11SELDAG-NEXT: s_endpgm
-;
-; GFX11GLISEL-LABEL: sgpr_isnan_f32:
-; GFX11GLISEL: ; %bb.0:
-; GFX11GLISEL-NEXT: s_clause 0x1
-; GFX11GLISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11GLISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11GLISEL-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX11GLISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11GLISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11GLISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11GLISEL-NEXT: s_endpgm
+; GFX8CHECK-LABEL: sgpr_isnan_f32:
+; GFX8CHECK: ; %bb.0:
+; GFX8CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT: s_endpgm
+;
+; GFX9CHECK-LABEL: sgpr_isnan_f32:
+; GFX9CHECK: ; %bb.0:
+; GFX9CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: s_endpgm
+;
+; GFX10CHECK-LABEL: sgpr_isnan_f32:
+; GFX10CHECK: ; %bb.0:
+; GFX10CHECK-NEXT: s_clause 0x1
+; GFX10CHECK-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: s_endpgm
+;
+; GFX11CHECK-LABEL: sgpr_isnan_f32:
+; GFX11CHECK: ; %bb.0:
+; GFX11CHECK-NEXT: s_clause 0x1
+; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
+; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: s_endpgm
%result = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -167,11 +109,9 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
; GFX7GLISEL: ; %bb.0:
; GFX7GLISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[4:5], s[2:3], 3
+; GFX7GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
+; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3]
; GFX7GLISEL-NEXT: s_mov_b32 s2, -1
-; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
-; GFX7GLISEL-NEXT: s_cselect_b32 s3, -1, 0
-; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX7GLISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
@@ -194,79 +134,40 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GLISEL-NEXT: s_endpgm
;
-; GFX9SELDAG-LABEL: sgpr_isnan_f64:
-; GFX9SELDAG: ; %bb.0:
-; GFX9SELDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9SELDAG-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9SELDAG-NEXT: s_endpgm
-;
-; GFX9GLISEL-LABEL: sgpr_isnan_f64:
-; GFX9GLISEL: ; %bb.0:
-; GFX9GLISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9GLISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX9GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX9GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX9GLISEL-NEXT: s_endpgm
-;
-; GFX10SELDAG-LABEL: sgpr_isnan_f64:
-; GFX10SELDAG: ; %bb.0:
-; GFX10SELDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX10SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10SELDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10SELDAG-NEXT: s_endpgm
-;
-; GFX10GLISEL-LABEL: sgpr_isnan_f64:
-; GFX10GLISEL: ; %bb.0:
-; GFX10GLISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GLISEL-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX10GLISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10GLISEL-NEXT: global_store_dword v1, v0, s[0:1]
-; GFX10GLISEL-NEXT: s_endpgm
-;
-; GFX11SELDAG-LABEL: sgpr_isnan_f64:
-; GFX11SELDAG: ; %bb.0:
-; GFX11SELDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11SELDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11SELDAG-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX11SELDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11SELDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11SELDAG-NEXT: s_endpgm
-;
-; GFX11GLISEL-LABEL: sgpr_isnan_f64:
-; GFX11GLISEL: ; %bb.0:
-; GFX11GLISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11GLISEL-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX11GLISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11GLISEL-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11GLISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11GLISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11GLISEL-NEXT: s_endpgm
+; GFX9CHECK-LABEL: sgpr_isnan_f64:
+; GFX9CHECK: ; %bb.0:
+; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: s_endpgm
+;
+; GFX10CHECK-LABEL: sgpr_isnan_f64:
+; GFX10CHECK: ; %bb.0:
+; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: s_endpgm
+;
+; GFX11CHECK-LABEL: sgpr_isnan_f64:
+; GFX11CHECK: ; %bb.0:
+; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
+; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: s_endpgm
%result = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -1567,3 +1468,10 @@ declare <7 x i1> @llvm.is.fpclass.v7f32(<7 x float>, i32)
declare <8 x i1> @llvm.is.fpclass.v8f32(<8 x float>, i32)
declare <16 x i1> @llvm.is.fpclass.v16f32(<16 x float>, i32)
declare <2 x i1> @llvm.is.fpclass.v2f64(<2 x double>, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10GLISEL: {{.*}}
+; GFX10SELDAG: {{.*}}
+; GFX11GLISEL: {{.*}}
+; GFX11SELDAG: {{.*}}
+; GFX9GLISEL: {{.*}}
+; GFX9SELDAG: {{.*}}
>From 9f569f553c107e2f8ed7a7da2a6325b9d10f7a7b Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Fri, 20 Feb 2026 20:06:16 -0800
Subject: [PATCH 5/7] Fix formatting
---
llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index d5efb17e45afa..e8731df27dfdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -518,8 +518,7 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
MachineInstr *CondDef;
Register TrueReg, FalseReg;
if (!mi_match(SgprSrc, MRI,
- m_GISelect(m_MInstr(CondDef), m_Reg(TrueReg),
- m_Reg(FalseReg))))
+ m_GISelect(m_MInstr(CondDef), m_Reg(TrueReg), m_Reg(FalseReg))))
return false;
if (CondDef->getOpcode() != AMDGPU::G_AMDGPU_COPY_SCC_VCC)
>From 52a74f92269fbd64bcbdb1dc0f55f1691db5dc12 Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Tue, 24 Feb 2026 00:32:58 -0800
Subject: [PATCH 6/7] Add simple hasOneNonDBGUse check
---
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 13 ++-
.../regbankcombiner-copy-scc-vcc-select.mir | 90 ++++++++-----------
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 10 +--
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 17 ++--
4 files changed, 61 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e8731df27dfdc..48ba7904f5195 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -505,7 +505,6 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
assert(MI.getOpcode() == AMDGPU::COPY);
- // TODO: Add a heuristic to determine whether the combine is profitable.
Register VgprDst = MI.getOperand(0).getReg();
Register SgprSrc = MI.getOperand(1).getReg();
@@ -515,6 +514,18 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
if (!isVgprRegBank(VgprDst))
return false;
+ // FIXME: Handle s64 types.
+ if (MRI.getType(VgprDst) != LLT::scalar(32))
+ return false;
+
+ // TODO: Use a heuristic to determine when we should combine instead.
+ // Only combine when the G_SELECT result has a single use (this COPY).
+ // With multiple uses the SGPR G_SELECT cannot be erased after the
+ // transformation, so a new VGPR G_SELECT would be added on top of the
+ // existing SGPR one.
+ if (!MRI.hasOneNonDBGUse(SgprSrc))
+ return false;
+
MachineInstr *CondDef;
Register TrueReg, FalseReg;
if (!mi_match(SgprSrc, MRI,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.mir
index af31e6099f320..ec3c28786793e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.mir
@@ -16,12 +16,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -52,12 +50,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -160,12 +156,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[FCMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -235,12 +229,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[IS_FPCLASS:%[0-9]+]]:vcc(s1) = G_IS_FPCLASS [[COPY]](s32), 3
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[IS_FPCLASS]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[IS_FPCLASS]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vcc(s1) = G_IS_FPCLASS %0, 3
@@ -270,12 +262,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -457,12 +447,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -491,12 +479,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -527,12 +513,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@@ -599,12 +583,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[ICMP]](s1)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AMDGPU_COPY_SCC_VCC]](s32), [[C]], [[C1]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[C]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 17b966d8dce07..6b0fa2f947e33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5402,15 +5402,15 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT: s_cselect_b32 s0, 0, s4
+; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index cbdad6f32e2ff..39960760a5961 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -5419,26 +5419,25 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0
; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc
; GFX6-NEXT: v_mov_b32_e32 v6, s2
+; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: v_mov_b32_e32 v7, s3
; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: s_cselect_b32 s0, 1, 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s0, s4, s5
+; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
>From b2cc0a798488211f9c5b2d58c75ba79d1c4db4dc Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Wed, 25 Feb 2026 18:00:28 -0800
Subject: [PATCH 7/7] Add initial cost heuristic
---
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 134 +++++++-
.../GlobalISel/inst-select-copy-scc-vcc.ll | 30 +-
.../regbankcombiner-clamp-minmax-const.mir | 8 +-
.../regbankcombiner-copy-scc-vcc-select.ll | 285 +++++++++---------
.../regbankcombiner-fmed3-minmax-const.mir | 32 +-
.../AMDGPU/GlobalISel/regbankselect-mui.ll | 5 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 53 +---
llvm/test/CodeGen/AMDGPU/fminimum.ll | 53 +---
8 files changed, 339 insertions(+), 261 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 48ba7904f5195..f2b135325f2f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -63,7 +63,10 @@ class AMDGPURegBankCombinerImpl : public Combiner {
bool tryCombineAll(MachineInstr &I) const override;
bool isVgprRegBank(Register Reg) const;
+ bool isSgprRegBank(Register Reg) const;
Register getAsVgpr(Register Reg) const;
+ int getAsVgprCost(Register Reg) const;
+ bool copySGPRToVGPRIsFree(Register VgprDst) const;
struct MinMaxMedOpc {
unsigned Min, Max, Med;
@@ -142,6 +145,10 @@ bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
}
+bool AMDGPURegBankCombinerImpl::isSgprRegBank(Register Reg) const {
+ return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
+}
+
Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
if (isVgprRegBank(Reg))
return Reg;
@@ -149,16 +156,30 @@ Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID);
// Build constants directly in VGPR instead of copying from SGPR.
- std::optional<ValueAndVReg> Val =
- getIConstantVRegValWithLookThrough(Reg, MRI);
- if (Val) {
- auto VgprCst = B.buildConstant(MRI.getType(Reg), Val->Value);
+ if (auto V = getIConstantVRegValWithLookThrough(Reg, MRI)) {
+ auto VgprCst = B.buildConstant(MRI.getType(Reg), V->Value);
MRI.setRegBank(VgprCst.getReg(0), VgprRB);
return VgprCst.getReg(0);
}
+ if (auto V = getFConstantVRegValWithLookThrough(Reg, MRI)) {
+ if (MRI.getType(Reg).getSizeInBits() >= 32) {
+ auto VgprCst = B.buildFConstant(MRI.getType(Reg), V->Value);
+ MRI.setRegBank(VgprCst.getReg(0), VgprRB);
+ return VgprCst.getReg(0);
+ }
+ }
+
+ // Look through READANYLANE: the VGPR source holds the same uniform value.
+ if (const MachineInstr *Def = MRI.getVRegDef(Reg)) {
+ if (Def->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE) {
+ Register Src = Def->getOperand(1).getReg();
+ if (isVgprRegBank(Src))
+ return Src;
+ }
+ }
// Search for existing copy of Reg to vgpr.
- for (MachineInstr &Use : MRI.use_instructions(Reg)) {
+ for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) {
Register Def = Use.getOperand(0).getReg();
if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def))
return Def;
@@ -170,6 +191,70 @@ Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
return VgprReg;
}
+// Return the estimated cost of performing the G_SELECT COPY_SCC_VCC combine.
+int AMDGPURegBankCombinerImpl::getAsVgprCost(Register Reg) const {
+ if (isVgprRegBank(Reg))
+ return 0;
+
+ if (auto Val = getAnyConstantVRegValWithLookThrough(Reg, MRI)) {
+ // Inline constant creates no new instructions.
+ if (TII.isInlineConstant(Val->Value))
+ return 0;
+
+ // Non-inline constant requires a v_mov to materialise in VGPR. On GFX10+
+ // it fits as an instruction literal alongside VCC, so no extra instruction
+ // is needed.
+ return STI.getGeneration() >= AMDGPUSubtarget::GFX10 ? 0 : 1;
+ }
+
+ // We can eliminate a READANYLANE.
+ if (const MachineInstr *Def = MRI.getVRegDef(Reg);
+ Def && Def->getOpcode() == AMDGPU::G_AMDGPU_READANYLANE &&
+ isVgprRegBank(Def->getOperand(1).getReg()))
+ return -1;
+
+ // Check if we can use a pre-existing copy instead of creating a new one.
+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) {
+ if (Use.getOpcode() == AMDGPU::COPY &&
+ isVgprRegBank(Use.getOperand(0).getReg()))
+ return 0;
+ }
+
+ // Forced to create a new COPY.
+ return 1;
+}
+
+// Check if all consumers of VgprDst have a free constant-bus slot, which
+// would allow RA to coalesce the SGPR->VGPR COPY and read the SGPR source
+// directly.
+bool AMDGPURegBankCombinerImpl::copySGPRToVGPRIsFree(Register VgprDst) const {
+ const bool IsGFX10Plus = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+ const bool Is64Bit = MRI.getType(VgprDst).getSizeInBits() == 64;
+
+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(VgprDst)) {
+ unsigned Opc = Use.getOpcode();
+ if (Opc == TargetOpcode::G_PHI || Opc == TargetOpcode::G_STORE)
+ return false;
+
+ // Conservatively models GCNSubtarget::getConstantBusLimit: 64-bit shift
+ // instructions can use only one scalar value input even on GFX10+.
+ unsigned BusLimit = IsGFX10Plus ? 2 : 1;
+ if (IsGFX10Plus && Is64Bit &&
+ (Opc == TargetOpcode::G_SHL || Opc == TargetOpcode::G_LSHR ||
+ Opc == TargetOpcode::G_ASHR))
+ BusLimit = 1;
+
+ unsigned SGPRUses = 0;
+ for (const MachineOperand &MO : Use.explicit_operands())
+ if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual() &&
+ isSgprRegBank(MO.getReg()))
+ ++SGPRUses;
+ if (SGPRUses >= BusLimit)
+ return false;
+ }
+ return true;
+}
+
AMDGPURegBankCombinerImpl::MinMaxMedOpc
AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
switch (Opc) {
@@ -499,8 +584,12 @@ bool AMDGPURegBankCombinerImpl::applyD16Load(
}
// Eliminate VCC->SGPR->VGPR register bounce for uniform boolean in VCC.
-// Match: COPY (G_SELECT (G_AMDGPU_COPY_SCC_VCC %vcc), %true, %false)
-// Replace with: G_SELECT %vcc, %vgpr_true, %vgpr_false
+// Match:
+// %sgpr = G_AMDGPU_COPY_SCC_VCC %vcc
+// %sgpr2 = G_SELECT %sgpr, %true, %false
+// %vgpr = COPY %sgpr2
+// into:
+// %vgpr = G_SELECT %vcc, %vgpr_true, %vgpr_false
bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
assert(MI.getOpcode() == AMDGPU::COPY);
@@ -518,7 +607,6 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
if (MRI.getType(VgprDst) != LLT::scalar(32))
return false;
- // TODO: Use a heuristic to determine when we should combine instead.
// Only combine when the G_SELECT result has a single use (this COPY).
// With multiple uses the SGPR G_SELECT cannot be erased after the
// transformation, so a new VGPR G_SELECT would be added on top of the
@@ -535,6 +623,35 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
if (CondDef->getOpcode() != AMDGPU::G_AMDGPU_COPY_SCC_VCC)
return false;
+ // Combining COPY_SCC_VCC + SGPR G_SELECT + COPY adds a VALU G_SELECT.
+ // Base savings = 2 but if the COPY was free (RA coalesces it), then only 1
+ int BaseSavings = copySGPRToVGPRIsFree(VgprDst) ? 1 : 2;
+ int Cost = getAsVgprCost(TrueReg) + getAsVgprCost(FalseReg);
+
+ // Account for both True and False values being non-inline constants
+ auto TrueV = getAnyConstantVRegValWithLookThrough(TrueReg, MRI);
+ auto FalseV = getAnyConstantVRegValWithLookThrough(FalseReg, MRI);
+ if (TrueV && !TII.isInlineConstant(TrueV->Value) && FalseV &&
+ !TII.isInlineConstant(FalseV->Value)) {
+ // s_cselect requires an s_mov for the second non-inline constant so we
+ // can eliminate an additional s_mov.
+ ++BaseSavings;
+
+ // On GFX10, non-inline constants were counted as zero cost but if there are
+ // two non-inline constants then one will require a v_mov.
+ if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
+ ++Cost;
+ }
+
+ if (Cost >= BaseSavings) {
+ LLVM_DEBUG(dbgs() << "matchCopySccVcc: not profitable (Cost=" << Cost
+ << " >= BaseSavings=" << BaseSavings << "), skipping "
+ << MI);
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "matchCopySccVcc: matched (Cost=" << Cost
+ << " < BaseSavings=" << BaseSavings << ") " << MI);
MatchInfo.VccReg = CondDef->getOperand(1).getReg();
MatchInfo.TrueReg = TrueReg;
MatchInfo.FalseReg = FalseReg;
@@ -543,6 +660,7 @@ bool AMDGPURegBankCombinerImpl::matchCopySccVcc(
void AMDGPURegBankCombinerImpl::applyCopySccVcc(
MachineInstr &MI, CopySccVccMatchInfo &MatchInfo) const {
+ LLVM_DEBUG(dbgs() << "applyCopySccVcc: applying to " << MI);
Register VgprDst = MI.getOperand(0).getReg();
Register VgprTrue = getAsVgpr(MatchInfo.TrueReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
index 2b1c9c68372db..315b02edea075 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -6,15 +6,15 @@
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
; GFX7-LABEL: fcmp_uniform_select:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9
+; GFX7-NEXT: s_load_dword s3, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_f32_e64 s[4:5], s6, 0
+; GFX7-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
+; GFX7-NEXT: s_cselect_b32 s3, s7, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: v_cmp_eq_f32_e64 vcc, s2, 0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@@ -25,11 +25,11 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: v_cmp_eq_f32_e64 vcc, s0, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cselect_b32 s0, s1, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -37,14 +37,16 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add
; GFX11-LABEL: fcmp_uniform_select:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s1, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, s1, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
%cmp = fcmp oeq float %a, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
index 70fd67363648d..3b048121fe51e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-minmax-const.mir
@@ -481,12 +481,10 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:vgpr(s32) = G_FCANONICALIZE [[FMUL]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
- ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FCANONICALIZE]], [[COPY2]], [[COPY3]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 0.000000e+00
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FMED3 [[FCANONICALIZE]], [[C1]], [[C2]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.ll
index 59850786e1a9f..6bb0896950d64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-copy-scc-vcc-select.ll
@@ -6,18 +6,16 @@ define amdgpu_ps void @test_fpclass_zext(float inreg %x, i32 %y, ptr addrspace(1
; GFX9-LABEL: test_fpclass_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_fpclass_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
@@ -31,18 +29,16 @@ define amdgpu_ps void @test_fpclass_sext(float inreg %x, i32 %y, ptr addrspace(1
; GFX9-LABEL: test_fpclass_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cselect_b32 s0, -1, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, v3, v0
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_fpclass_sext:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, -1, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
@@ -134,9 +130,7 @@ define amdgpu_ps void @test_fpclass_select_fconst_fconst(float inreg %x, ptr add
; GCN-LABEL: test_fpclass_select_fconst_fconst:
; GCN: ; %bb.0:
; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_cselect_b32 s0, 1.0, -1.0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_cndmask_b32_e64 v2, -1.0, 1.0, s[0:1]
; GCN-NEXT: global_store_dword v[0:1], v2, off
; GCN-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
@@ -145,6 +139,30 @@ define amdgpu_ps void @test_fpclass_select_fconst_fconst(float inreg %x, ptr add
ret void
}
+define amdgpu_ps void @test_fpclass_select_fconst_f16(half inreg %x, float inreg %y, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: test_fpclass_select_fconst_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_add_f16_e64 v2, s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s3, 3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_fpclass_select_fconst_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_f16_e64 v2, s2, s2
+; GFX10-NEXT: v_cmp_class_f32_e64 vcc, s3, 3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v2, vcc
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+ %res = fadd half %x, %x
+ %cond = call i1 @llvm.is.fpclass.f32(float %y, i32 3)
+ %sel = select i1 %cond, half %res, half 1.0
+ store half %sel, ptr addrspace(1) %ptr
+ ret void
+}
+
define amdgpu_ps void @test_fpclass_multiple_uses(float inreg %x, float inreg %y, float inreg %z, float inreg %a, float %b, ptr addrspace(1) %ptr) {
; GFX9-LABEL: test_fpclass_multiple_uses:
; GFX9: ; %bb.0:
@@ -247,14 +265,14 @@ define amdgpu_ps float @test_brcond(float inreg %x) {
; GCN-NEXT: s_cselect_b32 s0, 1, 0
; GCN-NEXT: s_xor_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB10_2
+; GCN-NEXT: s_cbranch_scc1 .LBB11_2
; GCN-NEXT: ; %bb.1: ; %if.true
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
-; GCN-NEXT: s_branch .LBB10_3
-; GCN-NEXT: .LBB10_2: ; %if.false
+; GCN-NEXT: s_branch .LBB11_3
+; GCN-NEXT: .LBB11_2: ; %if.false
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_branch .LBB10_3
-; GCN-NEXT: .LBB10_3:
+; GCN-NEXT: s_branch .LBB11_3
+; GCN-NEXT: .LBB11_3:
entry:
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
br i1 %cond, label %if.true, label %if.false
@@ -265,101 +283,79 @@ if.false:
}
define amdgpu_ps void @test_fpclass_select_readanylane_f16(half inreg %x, float inreg %y, half inreg %z, ptr addrspace(1) %ptr) {
-; GCN-LABEL: test_fpclass_select_readanylane_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_add_f16_e64 v2, s2, s2
-; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s3, 3
-; GCN-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_cselect_b32 s0, s4, s2
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_short v[0:1], v2, off
-; GCN-NEXT: s_endpgm
- %res = fadd half %x, %x
- %cond = call i1 @llvm.is.fpclass.f32(float %y, i32 3)
- %sel = select i1 %cond, half %z, half %res
- store half %sel, ptr addrspace(1) %ptr
- ret void
-}
-
-define amdgpu_ps void @test_fpclass_select_readanylane2_f16(half inreg %x, half inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
-; GFX9-LABEL: test_fpclass_select_readanylane2_f16:
+; GFX9-LABEL: test_fpclass_select_readanylane_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_f16_e64 v2, s2, s2
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_add_f16_e64 v2, s3, s3
-; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
-; GFX9-NEXT: v_readfirstlane_b32 s3, v2
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cselect_b32 s0, s2, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s3, 3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: test_fpclass_select_readanylane2_f16:
+; GFX10-LABEL: test_fpclass_select_readanylane_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_f16_e64 v2, s2, s2
-; GFX10-NEXT: v_add_f16_e64 v3, s3, s3
-; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, s2, s3
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s3, 3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s[0:1]
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
- %res1 = fadd half %x, %x
- %res2 = fadd half %y, %y
- %cond = call i1 @llvm.is.fpclass.f32(float %z, i32 3)
- %sel = select i1 %cond, half %res1, half %res2
+ %res = fadd half %x, %x
+ %cond = call i1 @llvm.is.fpclass.f32(float %y, i32 3)
+ %sel = select i1 %cond, half %z, half %res
store half %sel, ptr addrspace(1) %ptr
ret void
}
-define amdgpu_ps void @test_fpclass_select_readanylane_f32(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
-; GCN-LABEL: test_fpclass_select_readanylane_f32:
+define amdgpu_ps void @test_fpclass_select_readanylane2_f16(half inreg %x, half inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
+; GCN-LABEL: test_fpclass_select_readanylane2_f16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_add_f32_e64 v2, s2, s2
-; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s3, 3
-; GCN-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_cselect_b32 s0, s4, s2
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: v_add_f16_e64 v2, s2, s2
+; GCN-NEXT: v_add_f16_e64 v3, s3, s3
+; GCN-NEXT: v_cmp_class_f32_e64 vcc, s4, 3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GCN-NEXT: global_store_short v[0:1], v2, off
; GCN-NEXT: s_endpgm
- %res = fadd float %x, %x
- %cond = call i1 @llvm.is.fpclass.f32(float %y, i32 3)
- %sel = select i1 %cond, float %z, float %res
- store float %sel, ptr addrspace(1) %ptr
+ %res1 = fadd half %x, %x
+ %res2 = fadd half %y, %y
+ %cond = call i1 @llvm.is.fpclass.f32(float %z, i32 3)
+ %sel = select i1 %cond, half %res1, half %res2
+ store half %sel, ptr addrspace(1) %ptr
ret void
}
-define amdgpu_ps void @test_fpclass_select_readanylane2_f32(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
-; GFX9-LABEL: test_fpclass_select_readanylane2_f32:
+define amdgpu_ps void @test_fpclass_select_readanylane_f32(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: test_fpclass_select_readanylane_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_f32_e64 v2, s2, s2
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_add_f32_e64 v2, s3, s3
-; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
-; GFX9-NEXT: v_readfirstlane_b32 s3, v2
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cselect_b32 s0, s2, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s3, 3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: test_fpclass_select_readanylane2_f32:
+; GFX10-LABEL: test_fpclass_select_readanylane_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_f32_e64 v2, s2, s2
-; GFX10-NEXT: v_add_f32_e64 v3, s3, s3
-; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, s2, s3
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s3, 3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s[0:1]
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
+ %res = fadd float %x, %x
+ %cond = call i1 @llvm.is.fpclass.f32(float %y, i32 3)
+ %sel = select i1 %cond, float %z, float %res
+ store float %sel, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps void @test_fpclass_select_readanylane2_f32(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %ptr) {
+; GCN-LABEL: test_fpclass_select_readanylane2_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_f32_e64 v2, s2, s2
+; GCN-NEXT: v_add_f32_e64 v3, s3, s3
+; GCN-NEXT: v_cmp_class_f32_e64 vcc, s4, 3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_endpgm
%res1 = fadd float %x, %x
%res2 = fadd float %y, %y
%cond = call i1 @llvm.is.fpclass.f32(float %z, i32 3)
@@ -446,14 +442,22 @@ define amdgpu_ps void @test_fpclass_select_readanylane2_f64(double inreg %x, dou
}
define amdgpu_ps void @test_select_literal_imm_gfx10plus(float inreg %x, float %y, ptr addrspace(1) %ptr) {
-; GCN-LABEL: test_select_literal_imm_gfx10plus:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_cselect_b32 s0, 0x42c80000, 0
-; GCN-NEXT: v_add_f32_e32 v0, s0, v0
-; GCN-NEXT: global_store_dword v[1:2], v0, off
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: test_select_literal_imm_gfx10plus:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 0x42c80000, 0
+; GFX9-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_select_literal_imm_gfx10plus:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 0x42c80000, s[0:1]
+; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
%sel = select i1 %cond, float 100.0, float 0.0
%add = fadd float %sel, %y
@@ -462,15 +466,24 @@ define amdgpu_ps void @test_select_literal_imm_gfx10plus(float inreg %x, float %
}
define amdgpu_ps void @test_select_two_literal_imm(float inreg %x, float %y, ptr addrspace(1) %ptr) {
-; GCN-LABEL: test_select_two_literal_imm:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_mov_b32 s0, 0x42c80000
-; GCN-NEXT: s_cselect_b32 s0, s0, 0x43480000
-; GCN-NEXT: v_add_f32_e32 v0, s0, v0
-; GCN-NEXT: global_store_dword v[1:2], v0, off
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: test_select_two_literal_imm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0x43480000
+; GFX9-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_select_two_literal_imm:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v3, 0x43480000
+; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0x42c80000, s[0:1]
+; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
%sel = select i1 %cond, float 100.0, float 200.0
%add = fadd float %sel, %y
@@ -479,40 +492,36 @@ define amdgpu_ps void @test_select_two_literal_imm(float inreg %x, float %y, ptr
}
define amdgpu_ps void @test_select_two_literals_store(float inreg %x, ptr addrspace(1) %ptr) {
-; GCN-LABEL: test_select_two_literals_store:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT: s_movk_i32 s0, 0x64
-; GCN-NEXT: s_cselect_b32 s0, s0, 0xc8
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: global_store_dword v[0:1], v2, off
-; GCN-NEXT: s_endpgm
- %cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
- %sel = select i1 %cond, i32 100, i32 200
- store i32 %sel, ptr addrspace(1) %ptr
- ret void
-}
-
-define amdgpu_ps void @test_float_select_combine_inline_const(float inreg %x, float inreg %w, ptr addrspace(1) %ptr) {
-; GFX9-LABEL: test_float_select_combine_inline_const:
+; GFX9-LABEL: test_select_two_literals_store:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_cselect_b32 s0, 0, 1.0
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mul_f32_e32 v2, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x64
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xc8
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s2, 3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: test_float_select_combine_inline_const:
+; GFX10-LABEL: test_select_two_literals_store:
; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0xc8
; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, 0, 1.0
-; GFX10-NEXT: v_mul_f32_e64 v2, s0, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0x64, s[0:1]
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
+ %cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
+ %sel = select i1 %cond, i32 100, i32 200
+ store i32 %sel, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps void @test_float_select_combine_inline_const(float inreg %x, float inreg %w, ptr addrspace(1) %ptr) {
+; GCN-LABEL: test_float_select_combine_inline_const:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
+; GCN-NEXT: v_cndmask_b32_e64 v2, 1.0, 0, s[0:1]
+; GCN-NEXT: v_mul_f32_e32 v2, s3, v2
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
%sel = select i1 %cond, float 0.0, float 1.0
%mul = fmul float %sel, %w
@@ -520,14 +529,15 @@ define amdgpu_ps void @test_float_select_combine_inline_const(float inreg %x, fl
ret void
}
+; TODO: Improve combine heuristic for GFX9, this is a net 0 gain.
define amdgpu_ps void @test_select_preexisting_copy(float inreg %x, float inreg %y, float %z, ptr addrspace(1) %ptr) {
; GFX9-LABEL: test_select_preexisting_copy:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s2, 3
; GFX9-NEXT: v_add_f32_e32 v0, s3, v0
-; GFX9-NEXT: s_cselect_b32 s0, s3, 0
-; GFX9-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: v_add_f32_e32 v0, v3, v0
; GFX9-NEXT: global_store_dword v[1:2], v0, off
; GFX9-NEXT: s_endpgm
;
@@ -535,9 +545,8 @@ define amdgpu_ps void @test_select_preexisting_copy(float inreg %x, float inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cmp_class_f32_e64 s[0:1], s2, 3
; GFX10-NEXT: v_add_f32_e32 v0, s3, v0
-; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-NEXT: s_cselect_b32 s0, s3, 0
-; GFX10-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, s3, s[0:1]
+; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_endpgm
%cond = call i1 @llvm.is.fpclass.f32(float %x, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
index 2f41d86100040..fea21857db52b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-fmed3-minmax-const.mir
@@ -17,11 +17,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 4.000000e+00
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
@@ -49,11 +47,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 4.000000e+00
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
@@ -156,11 +152,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 4.000000e+00
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 4.000000e+00
@@ -188,11 +182,9 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 4.000000e+00
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[COPY2]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_FCONSTANT float 4.000000e+00
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[COPY]], [[C]], [[C1]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 4.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
index 3ddc94ac9308c..fa280a852383b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll
@@ -209,9 +209,10 @@ define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, pt
;
; NEW_RBS-LABEL: vcc_to_scc:
; NEW_RBS: ; %bb.0:
-; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2
; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
-; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0
+; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0
+; NEW_RBS-NEXT: s_cselect_b32 s0, s1, s2
+; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
%vcc_to_scc = fcmp oeq float %a, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index fa8dee971a8e9..833239d24191a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1027,41 +1027,22 @@ define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x
}
define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GFX9-SDAG-LABEL: fmaximumi_f32_move_to_valu:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[6:7] glc
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_max_f32_e32 v4, v1, v2
-; GFX9-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-SDAG-NEXT: s_endpgm
-;
-; GFX9-GISEL-LABEL: fmaximumi_f32_move_to_valu:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v1, v2
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-GISEL-NEXT: s_endpgm
+; GFX9-LABEL: fmaximumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: fmaximumi_f32_move_to_valu:
; GFX12-SDAG: ; %bb.0:
@@ -1136,8 +1117,6 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-GISEL-NEXT: v_max_f16_e32 v2, s2, v1
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 34d78ee93c974..b5835ea6aff31 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1027,41 +1027,22 @@ define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x
}
define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GFX9-SDAG-LABEL: fminimumi_f32_move_to_valu:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[6:7] glc
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_min_f32_e32 v4, v1, v2
-; GFX9-SDAG-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-SDAG-NEXT: s_endpgm
-;
-; GFX9-GISEL-LABEL: fminimumi_f32_move_to_valu:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[6:7] glc
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-GISEL-NEXT: s_endpgm
+; GFX9-LABEL: fminimumi_f32_move_to_valu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
;
; GFX12-SDAG-LABEL: fminimumi_f32_move_to_valu:
; GFX12-SDAG: ; %bb.0:
@@ -1136,8 +1117,6 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-GISEL-NEXT: v_min_f16_e32 v2, s2, v1
-; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, s2, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
More information about the llvm-branch-commits
mailing list