[llvm] [X86][FixupSetCC] Substitute setcc + zext pair with setzucc if possible (PR #96594)

Mon Jun 24 22:34:42 PDT 2024

https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/96594

>From 820ba4a1bc1c542cf4a99a9ed241e587d3adf4ea Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Tue, 25 Jun 2024 13:24:13 +0800
Subject: [PATCH] [X86][FixupSetCC] Substitute setcc + zext pair with setzucc
 when feature zu is available

---
 llvm/lib/Target/X86/X86FixupSetCC.cpp | 35 +++++++++++++------
 llvm/test/CodeGen/X86/apx/setzucc.ll  | 49 +++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/setzucc.ll

diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 5c7105988070c..6713efea97fed 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -17,6 +17,11 @@
 // performed by the setcc. Instead, we can use:
 // xor %eax, %eax; seta %al
 // This both avoids the stall, and encodes shorter.
+//
+// Furthermore, we can use:
+// setzua %al
+// if feature zero-upper is available. It's faster than the xor+setcc sequence.
+// When r16-r31 are used, it even encodes shorter.
 //===----------------------------------------------------------------------===//
 
 #include "X86.h"
@@ -46,6 +51,7 @@ class X86FixupSetCCPass : public MachineFunctionPass {
 
 private:
   MachineRegisterInfo *MRI = nullptr;
+  const X86Subtarget *ST = nullptr;
   const X86InstrInfo *TII = nullptr;
 
   enum { SearchBound = 16 };
@@ -61,7 +67,8 @@ FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
 bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
   MRI = &MF.getRegInfo();
-  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  ST = &MF.getSubtarget<X86Subtarget>();
+  TII = ST->getInstrInfo();
 
   SmallVector<MachineInstr*, 4> ToErase;
 
@@ -79,7 +86,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
         continue;
 
       MachineInstr *ZExt = nullptr;
-      for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+      Register Reg0 = MI.getOperand(0).getReg();
+      for (auto &Use : MRI->use_instructions(Reg0))
         if (Use.getOpcode() == X86::MOVZX32rr8)
           ZExt = &Use;
 
@@ -98,9 +106,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
         continue;
 
       // On 32-bit, we need to be careful to force an ABCD register.
-      const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
-                                          ? &X86::GR32RegClass
-                                          : &X86::GR32_ABCDRegClass;
+      const TargetRegisterClass *RC =
+          ST->is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass;
       if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
         // If we cannot constrain the register, we would need an additional copy
         // and are better off keeping the MOVZX32rr8 we have now.
@@ -110,17 +117,23 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
       ++NumSubstZexts;
       Changed = true;
 
-      // Initialize a register with 0. This must go before the eflags def
+      // X86 setcc/setzucc only takes an output GR8, so fake a GR32 input by
+      // inserting the setcc/setzucc result into the low byte of the zeroed
+      // register.
       Register ZeroReg = MRI->createVirtualRegister(RC);
-      BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
-              ZeroReg);
+      if (ST->hasZU()) {
+        MI.setDesc(TII->get(X86::SETZUCCr));
+        BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+                TII->get(TargetOpcode::IMPLICIT_DEF), ZeroReg);
+      } else
+        // Initialize a register with 0. This must go before the eflags def.
+        BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+                ZeroReg);
 
-      // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
-      // the setcc result into the low byte of the zeroed register.
       BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
               TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
           .addReg(ZeroReg)
-          .addReg(MI.getOperand(0).getReg())
+          .addReg(Reg0)
           .addImm(X86::sub_8bit);
       ToErase.push_back(ZExt);
     }
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
new file mode 100644
index 0000000000000..0cd560576739a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mattr=+zu | FileCheck %s
+
+define i16 @i8(i8 %x) nounwind {
+; CHECK-LABEL: i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $3, %dil
+; CHECK-NEXT:    setzuae %al
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %t0 = icmp ugt i8 %x, 2
+  %if = select i1 %t0, i8 1, i8 0
+  %zext = zext i8 %if to i16
+  ret i16 %zext
+}
+
+define i16 @i16(i16 %x) nounwind {
+; CHECK-LABEL: i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw $2, %di
+; CHECK-NEXT:    setzub %al
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %t0 = icmp ult i16 %x, 2
+  %if = select i1 %t0, i16 1, i16 0
+  ret i16 %if
+}
+
+define i32 @i32(i32 %x) nounwind {
+; CHECK-LABEL: i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    setzue %al
+; CHECK-NEXT:    retq
+  %t0 = icmp eq i32 %x, 1
+  %if = select i1 %t0, i32 1, i32 0
+  ret i32 %if
+}
+
+define i64 @i64(i64 %x) nounwind {
+; CHECK-LABEL: i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpq $1, %rdi
+; CHECK-NEXT:    setzune %al
+; CHECK-NEXT:    retq
+  %t0 = icmp ne i64 %x, 1
+  %if = select i1 %t0, i64 1, i64 0
+  ret i64 %if
+}