[llvm] [PPC] Add custom lowering for uaddo (PR #110137)
Zaara Syeda via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 26 09:04:15 PDT 2024
https://github.com/syzaara created https://github.com/llvm/llvm-project/pull/110137
Improve the codegen for the uaddo node for i64 in 64-bit mode and i32 in 32-bit mode by custom lowering it to an addc/addze sequence that materializes the carry bit directly.
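For illustration, here is essentially the i64 case from the new uaddo-64.ll test (value names changed for readability); with this patch the overflow bit is produced directly from the carry rather than through the generic expansion:

define i64 @add(i64 %a, i64 %b, ptr %ovf) {
entry:
  ; Compute the sum and the overflow bit in one intrinsic call.
  %pair = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %ovfbit = extractvalue { i64, i1 } %pair, 1
  %sum = extractvalue { i64, i1 } %pair, 0
  %ovfext = zext i1 %ovfbit to i64
  store i64 %ovfext, ptr %ovf, align 8
  ret i64 %sum
}

declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

Expected codegen (from the CHECK lines added in uaddo-64.ll): addc computes the sum and sets the carry, and addze of a zero register turns the carry into the overflow value:

  li 6, 0
  addc 3, 3, 4
  addze 4, 6
  std 4, 0(5)
  blr

The 32-bit case in uaddo-32.ll is analogous, using the i32 intrinsic and stw for the store.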
>From da682f23bc76ee40371a43cf09fe11738f03bab5 Mon Sep 17 00:00:00 2001
From: Zaara Syeda <syzaara at cpap8104.rtp.raleigh.ibm.com>
Date: Thu, 26 Sep 2024 11:49:54 -0400
Subject: [PATCH] [PPC] Add custom lowering for uaddo
Improve the codegen for the uaddo node for i64 in 64-bit mode and
i32 in 32-bit mode by custom lowering.
---
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 43 +++++++++++++++
llvm/lib/Target/PowerPC/PPCISelLowering.h | 1 +
llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 61 +++++++++++++++++++--
llvm/test/CodeGen/PowerPC/sat-add.ll | 5 +-
llvm/test/CodeGen/PowerPC/uaddo-32.ll | 37 +++++++++++++
llvm/test/CodeGen/PowerPC/uaddo-64.ll | 37 +++++++++++++
6 files changed, 175 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/PowerPC/uaddo-32.ll
create mode 100644 llvm/test/CodeGen/PowerPC/uaddo-64.ll
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d9847a21489e63..c3497314b91e94 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -198,6 +198,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
}
+ if (!Subtarget.hasP10Vector())
+ setOperationAction(ISD::UADDO, isPPC64 ? MVT::i64 : MVT::i32, Custom);
+
// Match BITREVERSE to customized fast code sequence in the td file.
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
@@ -11967,11 +11970,51 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("ERROR:Should return for all cases within swtich.");
}
+SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const {
+ // Default to target independent lowering if there is a logical user of the
+ // carry-bit.
+ for (SDNode *U : Op->uses()) {
+ if (U->getOpcode() == ISD::SELECT || ISD::isBitwiseLogicOp(U->getOpcode()))
+ return SDValue();
+ }
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ // Default to target independent lowering for special cases handled there.
+ if (isOneConstant(RHS) || isAllOnesConstant(RHS))
+ return SDValue();
+
+ EVT VT = Op.getNode()->getValueType(0);
+ bool is64Bit = Subtarget.isPPC64();
+
+ SDValue ADDC;
+ SDValue Overflow;
+ SDVTList VTs = Op.getNode()->getVTList();
+
+ ADDC = SDValue(DAG.getMachineNode(is64Bit ? PPC::ADDC8 : PPC::ADDC, dl, VT,
+ MVT::Glue, LHS, RHS),
+ 0);
+ SDValue Li = SDValue(DAG.getMachineNode(is64Bit ? PPC::LI8 : PPC::LI, dl, VT,
+ DAG.getTargetConstant(0, dl, VT)),
+ 0);
+ Overflow = SDValue(DAG.getMachineNode(is64Bit ? PPC::ADDZE8 : PPC::ADDZE, dl,
+ VT, MVT::Glue, Li, ADDC.getValue(1)),
+ 0);
+ SDValue OverflowTrunc =
+ DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow);
+ SDValue Res =
+ DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc);
+ return Res;
+}
+
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+ case ISD::UADDO:
+ return LowerUaddo(Op, DAG);
case ISD::FPOW: return lowerPow(Op, DAG);
case ISD::FSIN: return lowerSin(Op, DAG);
case ISD::FCOS: return lowerCos(Op, DAG);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 8907c3c5a81c3c..7285f6de4728d5 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1277,6 +1277,7 @@ namespace llvm {
SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index b8abee76cdfa80..362eaf33ba96a8 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -139,6 +139,8 @@ struct PPCMIPeephole : public MachineFunctionPass {
void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves,
MachineInstr *MI);
+ bool eliminateTruncWhenLoweringUADDO(MachineInstr &MI,
+ MachineInstr *&ToErase);
// A number of transformations will eliminate the definition of a register
// as all of its uses will be removed. However, this leaves a register
// without a definition for LiveVariables. Such transformations should
@@ -1071,6 +1073,18 @@ bool PPCMIPeephole::simplifyCode() {
break;
}
case PPC::RLDICL: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ break;
+
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ // We can eliminate clearing the left 63 bits when only the carry-bit is
+ // set.
+ if (eliminateTruncWhenLoweringUADDO(MI, ToErase)) {
+ Simplified = true;
+ break;
+ }
+
// We can eliminate RLDICL (e.g. for zero-extension)
// if all bits to clear are already zero in the input.
// This code assume following code sequence for zero-extension.
@@ -1082,11 +1096,6 @@ bool PPCMIPeephole::simplifyCode() {
if (MI.getOperand(2).getImm() != 0)
break;
- Register SrcReg = MI.getOperand(1).getReg();
- if (!SrcReg.isVirtual())
- break;
-
- MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
if (!(SrcMI && SrcMI->getOpcode() == PPC::INSERT_SUBREG &&
SrcMI->getOperand(0).isReg() && SrcMI->getOperand(1).isReg()))
break;
@@ -1277,7 +1286,15 @@ bool PPCMIPeephole::simplifyCode() {
Simplified = true;
break;
}
- case PPC::RLWINM:
+ case PPC::RLWINM: {
+ // We can eliminate clearing the left 31 bits when only the carry-bit is
+ // set.
+ if (eliminateTruncWhenLoweringUADDO(MI, ToErase)) {
+ Simplified = true;
+ break;
+ }
+ }
+ LLVM_FALLTHROUGH;
case PPC::RLWINM_rec:
case PPC::RLWINM8:
case PPC::RLWINM8_rec: {
@@ -1889,6 +1906,38 @@ bool PPCMIPeephole::eliminateRedundantCompare() {
return Simplified;
}
+bool PPCMIPeephole::eliminateTruncWhenLoweringUADDO(MachineInstr &MI,
+ MachineInstr *&ToErase) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ return false;
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+
+ bool Is64Bit = MI.getOpcode() == PPC::RLDICL;
+ int Imm = Is64Bit ? 63 : 31;
+ if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != Imm)
+ return false;
+ if (SrcMI->getOpcode() != (Is64Bit ? PPC::ADDZE8 : PPC::ADDZE))
+ return false;
+ MachineInstr *LI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+ if (LI->getOpcode() != (Is64Bit ? PPC::LI8 : PPC::LI))
+ return false;
+ if (LI->getOperand(1).getImm() != 0 || MI.getOperand(2).getImm() != 0)
+ return false;
+ Register NewReg = SrcMI->getOperand(0).getReg();
+ ToErase = &MI;
+ Register MIDestReg = MI.getOperand(0).getReg();
+ for (MachineInstr &UseMI : MRI->use_instructions(MIDestReg)) {
+ for (MachineOperand &MO : UseMI.operands()) {
+ if (MO.isReg() && MO.getReg() == MIDestReg) {
+ MO.setReg(NewReg);
+ addRegToUpdate(NewReg);
+ break;
+ }
+ }
+ }
+ return true;
+}
// We miss the opportunity to emit an RLDIC when lowering jump tables
// since ISEL sees only a single basic block. When selecting, the clear
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index f699ea54192d88..8fff2c28da245e 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -170,11 +170,10 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) {
define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) {
; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval:
; CHECK: # %bb.0:
-; CHECK-NEXT: li 5, -43
; CHECK-NEXT: addi 4, 3, 42
-; CHECK-NEXT: cmpld 3, 5
+; CHECK-NEXT: cmpld 4, 3
; CHECK-NEXT: li 3, -1
-; CHECK-NEXT: iselgt 3, 3, 4
+; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: blr
%a = add i64 %x, 42
%c = icmp ugt i64 %x, -43
diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll
new file mode 100644
index 00000000000000..7c741e3618e6fc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s
+
+define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) {
+; CHECK-LABEL: add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: addc 3, 3, 4
+; CHECK-NEXT: addze 4, 6
+; CHECK-NEXT: stw 4, 0(5)
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ %2 = extractvalue { i32, i1 } %0, 0
+ %3 = zext i1 %1 to i32
+ store i32 %3, ptr %ovf, align 8
+ ret i32 %2
+}
+
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)
+
+define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) {
+; CHECK-LABEL: add_overflow:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: addc 4, 3, 4
+; CHECK-NEXT: addze 3, 6
+; CHECK-NEXT: stw 4, 0(5)
+; CHECK-NEXT: blr
+entry:
+ %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+ %1 = extractvalue { i32, i1 } %0, 1
+ %2 = extractvalue { i32, i1 } %0, 0
+ store i32 %2, ptr %ovf, align 8
+ ret i1 %1
+}
diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll
new file mode 100644
index 00000000000000..ef4eccd329d9f1
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=PPC64
+
+define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) {
+; PPC64-LABEL: add:
+; PPC64: # %bb.0: # %entry
+; PPC64-NEXT: li 6, 0
+; PPC64-NEXT: addc 3, 3, 4
+; PPC64-NEXT: addze 4, 6
+; PPC64-NEXT: std 4, 0(5)
+; PPC64-NEXT: blr
+entry:
+ %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %1 = extractvalue { i64, i1 } %0, 1
+ %2 = extractvalue { i64, i1 } %0, 0
+ %3 = zext i1 %1 to i64
+ store i64 %3, ptr %ovf, align 8
+ ret i64 %2
+}
+
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) {
+; PPC64-LABEL: add_overflow:
+; PPC64: # %bb.0: # %entry
+; PPC64-NEXT: li 6, 0
+; PPC64-NEXT: addc 4, 3, 4
+; PPC64-NEXT: addze 3, 6
+; PPC64-NEXT: std 4, 0(5)
+; PPC64-NEXT: blr
+entry:
+ %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %1 = extractvalue { i64, i1 } %0, 1
+ %2 = extractvalue { i64, i1 } %0, 0
+ store i64 %2, ptr %ovf, align 8
+ ret i1 %1
+}