[llvm] [X86][FixupSetCC] Substitute setcc + zext pair with setzucc if possible (PR #96594)
Shengchen Kan via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 24 22:34:42 PDT 2024
https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/96594
>From 820ba4a1bc1c542cf4a99a9ed241e587d3adf4ea Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Tue, 25 Jun 2024 13:24:13 +0800
Subject: [PATCH] [X86][FixupSetCC] Substitute setcc + zext pair with setzucc
 when the zu feature is available
---
llvm/lib/Target/X86/X86FixupSetCC.cpp | 35 +++++++++++++------
llvm/test/CodeGen/X86/apx/setzucc.ll | 49 +++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 11 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/apx/setzucc.ll
diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 5c7105988070c..6713efea97fed 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -17,6 +17,11 @@
// performed by the setcc. Instead, we can use:
// xor %eax, %eax; seta %al
// This both avoids the stall, and encodes shorter.
+//
+// Furthermore, we can use:
+// setzua %al
+// if the zero-upper (zu) feature is available. It's faster than the xor+setcc
+// sequence. When r16-r31 are used, it even encodes shorter.
//===----------------------------------------------------------------------===//
#include "X86.h"
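
For context, a minimal source-level sketch (the function below and the exact registers are illustrative, not part of the patch) of C++ code whose lowering produces the setcc + zext pair this pass rewrites, alongside the two asm sequences described in the comment above:

// Illustrative only: a boolean comparison widened to a 32-bit result.
// Without zu, FixupSetCC already emits (register choice may differ):
//   xorl %eax, %eax       # clear the upper bits before the flags def
//   cmpl %esi, %edi
//   seta %al              # writes only the low byte
// With -mattr=+zu, the expected output per the new test is:
//   cmpl %esi, %edi
//   setzua %al            # zero-upper form, no separate xor needed
unsigned isGreater(unsigned A, unsigned B) { return A > B; }
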
@@ -46,6 +51,7 @@ class X86FixupSetCCPass : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *ST = nullptr;
const X86InstrInfo *TII = nullptr;
enum { SearchBound = 16 };
@@ -61,7 +67,8 @@ FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MRI = &MF.getRegInfo();
- TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ ST = &MF.getSubtarget<X86Subtarget>();
+ TII = ST->getInstrInfo();
SmallVector<MachineInstr*, 4> ToErase;
@@ -79,7 +86,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
continue;
MachineInstr *ZExt = nullptr;
- for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+ Register Reg0 = MI.getOperand(0).getReg();
+ for (auto &Use : MRI->use_instructions(Reg0))
if (Use.getOpcode() == X86::MOVZX32rr8)
ZExt = &Use;
@@ -98,9 +106,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
continue;
// On 32-bit, we need to be careful to force an ABCD register.
- const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
- ? &X86::GR32RegClass
- : &X86::GR32_ABCDRegClass;
+ const TargetRegisterClass *RC =
+ ST->is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass;
if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
// If we cannot constrain the register, we would need an additional copy
// and are better off keeping the MOVZX32rr8 we have now.
@@ -110,17 +117,23 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
++NumSubstZexts;
Changed = true;
- // Initialize a register with 0. This must go before the eflags def
+    // X86 setcc/setzucc only takes an output GR8, so fake a GR32 input by
+    // inserting the setcc/setzucc result into the low byte of the zeroed (or,
+    // for setzucc, implicitly defined) register.
Register ZeroReg = MRI->createVirtualRegister(RC);
- BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
- ZeroReg);
+ if (ST->hasZU()) {
+ MI.setDesc(TII->get(X86::SETZUCCr));
+ BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), ZeroReg);
+ } else
+ // Initialize a register with 0. This must go before the eflags def
+ BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+ ZeroReg);
- // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
- // the setcc result into the low byte of the zeroed register.
BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
.addReg(ZeroReg)
- .addReg(MI.getOperand(0).getReg())
+ .addReg(Reg0)
.addImm(X86::sub_8bit);
ToErase.push_back(ZExt);
}
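
To spell out why the xor becomes unnecessary, here is a value-level C++ sketch (illustrative only; the helper names are made up): the legacy setcc writes just the low 8 bits of the destination, so a separate instruction must clear the upper bits, whereas the zero-upper form clears them itself, which is why the pass can pair SETZUCCr with an IMPLICIT_DEF instead of a MOV32r0.

#include <cstdint>

// Legacy lowering: xorl %eax, %eax ; setcc %al
// The xor clears bits 31..8 up front; setcc then writes only bits 7..0.
uint32_t legacySetcc(bool Cond) {
  uint32_t Dst = 0;                         // xorl %eax, %eax
  Dst = (Dst & ~0xFFu) | (Cond ? 1u : 0u);  // setcc %al (low byte only)
  return Dst;
}

// zu lowering: setzucc %al
// The instruction zeroes the upper bits of the destination itself, so no
// explicit zeroing is emitted before the flags-producing instruction.
uint32_t zuSetcc(bool Cond) {
  return Cond ? 1u : 0u;                    // setzucc %al
}
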
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
new file mode 100644
index 0000000000000..0cd560576739a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mattr=+zu | FileCheck %s
+
+define i16 @i8(i8 %x) nounwind {
+; CHECK-LABEL: i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpb $3, %dil
+; CHECK-NEXT: setzuae %al
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %t0 = icmp ugt i8 %x, 2
+ %if = select i1 %t0, i8 1, i8 0
+ %zext = zext i8 %if to i16
+ ret i16 %zext
+}
+
+define i16 @i16(i16 %x) nounwind {
+; CHECK-LABEL: i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpw $2, %di
+; CHECK-NEXT: setzub %al
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %t0 = icmp ult i16 %x, 2
+ %if = select i1 %t0, i16 1, i16 0
+ ret i16 %if
+}
+
+define i32 @i32(i32 %x) nounwind {
+; CHECK-LABEL: i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpl $1, %edi
+; CHECK-NEXT: setzue %al
+; CHECK-NEXT: retq
+ %t0 = icmp eq i32 %x, 1
+ %if = select i1 %t0, i32 1, i32 0
+ ret i32 %if
+}
+
+define i64 @i64(i64 %x) nounwind {
+; CHECK-LABEL: i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpq $1, %rdi
+; CHECK-NEXT: setzune %al
+; CHECK-NEXT: retq
+ %t0 = icmp ne i64 %x, 1
+ %if = select i1 %t0, i64 1, i64 0
+ ret i64 %if
+}