[llvm] [PowerPC] Check value uses in ValueBit tracking (PR #66040)

Mon Sep 11 19:54:13 PDT 2023

https://github.com/ecnelises created https://github.com/llvm/llvm-project/pull/66040:

PowerPC instruction selection tracks bits for every series of bitwise
operations. But in some cases the method selects worse result for complex
PowerPC rotate instructions. Check use of visited operands to avoid suboptimal
analysis.

>From a5d5588093af53689d63b47f497247d534a4ef5b Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan at cn.ibm.com>
Date: Tue, 12 Sep 2023 10:39:55 +0800
Subject: [PATCH] [PowerPC] Check value uses in ValueBit tracking

---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp   | 162 +++++++++++-------
 llvm/test/CodeGen/PowerPC/int128_ldst.ll      |  18 +-
 .../PowerPC/loop-instr-form-prepare.ll        |   6 +-
 llvm/test/CodeGen/PowerPC/prefer-dqform.ll    |   4 +-
 llvm/test/CodeGen/PowerPC/rldimi.ll           |  51 ++++++
 5 files changed, 163 insertions(+), 78 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/rldimi.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 61a3138007db900..8d7926a4444abc1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1629,30 +1629,41 @@ class BitPermutationSelector {
     bool &Interesting = ValueEntry->first;
     SmallVector<ValueBit, 64> &Bits = ValueEntry->second;
     Bits.resize(NumBits);
+    SDValue LHS = V.getNumOperands() > 0 ? V.getOperand(0) : SDValue();
+    SDValue RHS = V.getNumOperands() > 1 ? V.getOperand(1) : SDValue();
 
     switch (V.getOpcode()) {
     default: break;
     case ISD::ROTL:
-      if (isa<ConstantSDNode>(V.getOperand(1))) {
+      if (isa<ConstantSDNode>(RHS)) {
         unsigned RotAmt = V.getConstantOperandVal(1);
 
-        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
-
-        for (unsigned i = 0; i < NumBits; ++i)
-          Bits[i] = LHSBits[i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt];
+        if (LHS.hasOneUse()) {
+          const auto &LHSBits = *getValueBits(LHS, NumBits).second;
+          for (unsigned i = 0; i < NumBits; ++i)
+            Bits[i] = LHSBits[i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt];
+        } else {
+          for (unsigned i = 0; i < NumBits; ++i)
+            Bits[i] =
+                ValueBit(LHS, i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt);
+        }
 
         return std::make_pair(Interesting = true, &Bits);
       }
       break;
     case ISD::SHL:
     case PPCISD::SHL:
-      if (isa<ConstantSDNode>(V.getOperand(1))) {
+      if (isa<ConstantSDNode>(RHS)) {
         unsigned ShiftAmt = V.getConstantOperandVal(1);
 
-        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
-
-        for (unsigned i = ShiftAmt; i < NumBits; ++i)
-          Bits[i] = LHSBits[i - ShiftAmt];
+        if (LHS.hasOneUse()) {
+          const auto &LHSBits = *getValueBits(LHS, NumBits).second;
+          for (unsigned i = ShiftAmt; i < NumBits; ++i)
+            Bits[i] = LHSBits[i - ShiftAmt];
+        } else {
+          for (unsigned i = ShiftAmt; i < NumBits; ++i)
+            Bits[i] = ValueBit(LHS, i - ShiftAmt);
+        }
 
         for (unsigned i = 0; i < ShiftAmt; ++i)
           Bits[i] = ValueBit(ValueBit::ConstZero);
@@ -1662,13 +1673,17 @@ class BitPermutationSelector {
       break;
     case ISD::SRL:
     case PPCISD::SRL:
-      if (isa<ConstantSDNode>(V.getOperand(1))) {
+      if (isa<ConstantSDNode>(RHS)) {
         unsigned ShiftAmt = V.getConstantOperandVal(1);
 
-        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
-
-        for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
-          Bits[i] = LHSBits[i + ShiftAmt];
+        if (LHS.hasOneUse()) {
+          const auto &LHSBits = *getValueBits(LHS, NumBits).second;
+          for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
+            Bits[i] = LHSBits[i + ShiftAmt];
+        } else {
+          for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
+            Bits[i] = ValueBit(LHS, i + ShiftAmt);
+        }
 
         for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i)
           Bits[i] = ValueBit(ValueBit::ConstZero);
@@ -1677,23 +1692,27 @@ class BitPermutationSelector {
       }
       break;
     case ISD::AND:
-      if (isa<ConstantSDNode>(V.getOperand(1))) {
+      if (isa<ConstantSDNode>(RHS)) {
         uint64_t Mask = V.getConstantOperandVal(1);
 
-        const SmallVector<ValueBit, 64> *LHSBits;
+        const SmallVector<ValueBit, 64> *LHSBits = nullptr;
         // Mark this as interesting, only if the LHS was also interesting. This
         // prevents the overall procedure from matching a single immediate 'and'
         // (which is non-optimal because such an and might be folded with other
         // things if we don't select it here).
-        std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), NumBits);
+        if (LHS.hasOneUse())
+          std::tie(Interesting, LHSBits) = getValueBits(LHS, NumBits);
 
         for (unsigned i = 0; i < NumBits; ++i)
-          if (((Mask >> i) & 1) == 1)
-            Bits[i] = (*LHSBits)[i];
-          else {
+          if (((Mask >> i) & 1) == 1) {
+            if (LHS.hasOneUse())
+              Bits[i] = (*LHSBits)[i];
+            else
+              Bits[i] = ValueBit(LHS, i);
+          } else {
             // AND instruction masks this bit. If the input is already zero,
             // we have nothing to do here. Otherwise, make the bit ConstZero.
-            if ((*LHSBits)[i].isZero())
+            if (LHS.hasOneUse() && (*LHSBits)[i].isZero())
               Bits[i] = (*LHSBits)[i];
             else
               Bits[i] = ValueBit(ValueBit::ConstZero);
@@ -1703,34 +1722,44 @@ class BitPermutationSelector {
       }
       break;
     case ISD::OR: {
-      const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
-      const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
+      const auto *LHSBits =
+          LHS.hasOneUse() ? getValueBits(LHS, NumBits).second : nullptr;
+      const auto *RHSBits =
+          RHS.hasOneUse() ? getValueBits(RHS, NumBits).second : nullptr;
 
       bool AllDisjoint = true;
       SDValue LastVal = SDValue();
       unsigned LastIdx = 0;
       for (unsigned i = 0; i < NumBits; ++i) {
-        if (LHSBits[i].isZero() && RHSBits[i].isZero()) {
+        if (LHSBits && RHSBits && (*LHSBits)[i].isZero() &&
+            (*RHSBits)[i].isZero()) {
           // If both inputs are known to be zero and one is ConstZero and
           // another is VariableKnownToBeZero, we can select whichever
           // we like. To minimize the number of bit groups, we select
           // VariableKnownToBeZero if this bit is the next bit of the same
           // input variable from the previous bit. Otherwise, we select
           // ConstZero.
-          if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal &&
-              LHSBits[i].getValueBitIndex() == LastIdx + 1)
-            Bits[i] = LHSBits[i];
-          else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal &&
-                   RHSBits[i].getValueBitIndex() == LastIdx + 1)
-            Bits[i] = RHSBits[i];
+          const auto &LBits = *LHSBits;
+          const auto &RBits = *RHSBits;
+          if (LBits[i].hasValue() && LBits[i].getValue() == LastVal &&
+              LBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = LBits[i];
+          else if (RBits[i].hasValue() && RBits[i].getValue() == LastVal &&
+                   RBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = RBits[i];
           else
             Bits[i] = ValueBit(ValueBit::ConstZero);
-        }
-        else if (LHSBits[i].isZero())
-          Bits[i] = RHSBits[i];
-        else if (RHSBits[i].isZero())
-          Bits[i] = LHSBits[i];
-        else {
+        } else if (LHSBits && (*LHSBits)[i].isZero()) {
+          if (RHSBits)
+            Bits[i] = (*RHSBits)[i];
+          else
+            Bits[i] = ValueBit(RHS, i);
+        } else if (RHSBits && (*RHSBits)[i].isZero()) {
+          if (LHSBits)
+            Bits[i] = (*LHSBits)[i];
+          else
+            Bits[i] = ValueBit(LHS, i);
+        } else {
           AllDisjoint = false;
           break;
         }
@@ -1738,9 +1767,9 @@ class BitPermutationSelector {
         if (Bits[i].hasValue()) {
           LastVal = Bits[i].getValue();
           LastIdx = Bits[i].getValueBitIndex();
-        }
-        else {
-          if (LastVal) LastVal = SDValue();
+        } else {
+          if (LastVal)
+            LastVal = SDValue();
           LastIdx = 0;
         }
       }
@@ -1752,17 +1781,19 @@ class BitPermutationSelector {
     }
     case ISD::ZERO_EXTEND: {
       // We support only the case with zero extension from i32 to i64 so far.
-      if (V.getValueType() != MVT::i64 ||
-          V.getOperand(0).getValueType() != MVT::i32)
+      if (V.getValueType() != MVT::i64 || LHS.getValueType() != MVT::i32)
         break;
 
-      const SmallVector<ValueBit, 64> *LHSBits;
       const unsigned NumOperandBits = 32;
-      std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
-                                                    NumOperandBits);
-
-      for (unsigned i = 0; i < NumOperandBits; ++i)
-        Bits[i] = (*LHSBits)[i];
+      if (LHS.hasOneUse()) {
+        const SmallVector<ValueBit, 64> *LHSBits;
+        std::tie(Interesting, LHSBits) = getValueBits(LHS, NumOperandBits);
+        for (unsigned i = 0; i < NumOperandBits; ++i)
+          Bits[i] = (*LHSBits)[i];
+      } else {
+        for (unsigned i = 0; i < NumOperandBits; ++i)
+          Bits[i] = ValueBit(LHS, i);
+      }
 
       for (unsigned i = NumOperandBits; i < NumBits; ++i)
         Bits[i] = ValueBit(ValueBit::ConstZero);
@@ -1770,15 +1801,14 @@ class BitPermutationSelector {
       return std::make_pair(Interesting, &Bits);
     }
     case ISD::TRUNCATE: {
-      EVT FromType = V.getOperand(0).getValueType();
+      EVT FromType = LHS.getValueType();
       EVT ToType = V.getValueType();
       // We support only the case with truncate from i64 to i32.
-      if (FromType != MVT::i64 || ToType != MVT::i32)
+      if (FromType != MVT::i64 || ToType != MVT::i32 || !LHS.hasOneUse())
         break;
       const unsigned NumAllBits = FromType.getSizeInBits();
       SmallVector<ValueBit, 64> *InBits;
-      std::tie(Interesting, InBits) = getValueBits(V.getOperand(0),
-                                                    NumAllBits);
+      std::tie(Interesting, InBits) = getValueBits(LHS, NumAllBits);
       const unsigned NumValidBits = ToType.getSizeInBits();
 
       // A 32-bit instruction cannot touch upper 32-bit part of 64-bit value.
@@ -1801,22 +1831,28 @@ class BitPermutationSelector {
       // For AssertZext, we look through the operand and
       // mark the bits known to be zero.
       const SmallVector<ValueBit, 64> *LHSBits;
-      std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
-                                                    NumBits);
 
-      EVT FromType = cast<VTSDNode>(V.getOperand(1))->getVT();
+      EVT FromType = cast<VTSDNode>(RHS)->getVT();
       const unsigned NumValidBits = FromType.getSizeInBits();
-      for (unsigned i = 0; i < NumValidBits; ++i)
-        Bits[i] = (*LHSBits)[i];
 
       // These bits are known to be zero but the AssertZext may be from a value
       // that already has some constant zero bits (i.e. from a masking and).
-      for (unsigned i = NumValidBits; i < NumBits; ++i)
-        Bits[i] = (*LHSBits)[i].hasValue()
-                      ? ValueBit((*LHSBits)[i].getValue(),
-                                 (*LHSBits)[i].getValueBitIndex(),
-                                 ValueBit::VariableKnownToBeZero)
-                      : ValueBit(ValueBit::ConstZero);
+      if (LHS.hasOneUse()) {
+        std::tie(Interesting, LHSBits) = getValueBits(LHS, NumBits);
+        for (unsigned i = 0; i < NumValidBits; ++i)
+          Bits[i] = (*LHSBits)[i];
+        for (unsigned i = NumValidBits; i < NumBits; ++i)
+          Bits[i] = (*LHSBits)[i].hasValue()
+                        ? ValueBit((*LHSBits)[i].getValue(),
+                                   (*LHSBits)[i].getValueBitIndex(),
+                                   ValueBit::VariableKnownToBeZero)
+                        : ValueBit(ValueBit::ConstZero);
+      } else {
+        for (unsigned i = 0; i < NumValidBits; ++i)
+          Bits[i] = ValueBit(LHS, i);
+        for (unsigned i = NumValidBits; i < NumBits; ++i)
+          Bits[i] = ValueBit(LHS, i, ValueBit::VariableKnownToBeZero);
+      }
 
       return std::make_pair(Interesting, &Bits);
     }
diff --git a/llvm/test/CodeGen/PowerPC/int128_ldst.ll b/llvm/test/CodeGen/PowerPC/int128_ldst.ll
index 7f5f6a181c1b01c..b9afca4a892fe30 100644
--- a/llvm/test/CodeGen/PowerPC/int128_ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/int128_ldst.ll
@@ -208,11 +208,10 @@ entry:
 define dso_local i128 @ld_or2___int128___int128(i64 %ptr, i8 zeroext %off) {
 ; CHECK-LABEL: ld_or2___int128___int128:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    rldicr 5, 3, 0, 51
-; CHECK-NEXT:    rotldi 6, 3, 52
-; CHECK-NEXT:    ldx 3, 5, 4
-; CHECK-NEXT:    rldimi 4, 6, 12, 0
-; CHECK-NEXT:    ld 4, 8(4)
+; CHECK-NEXT:    rldicr 3, 3, 0, 51
+; CHECK-NEXT:    or 5, 3, 4
+; CHECK-NEXT:    ldx 3, 3, 4
+; CHECK-NEXT:    ld 4, 8(5)
 ; CHECK-NEXT:    blr
 entry:
   %and = and i64 %ptr, -4096
@@ -740,11 +739,10 @@ entry:
 define dso_local void @st_or2__int128___int128(i64 %ptr, i8 zeroext %off, i128 %str) {
 ; CHECK-LABEL: st_or2__int128___int128:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    rldicr 7, 3, 0, 51
-; CHECK-NEXT:    rotldi 3, 3, 52
-; CHECK-NEXT:    stdx 5, 7, 4
-; CHECK-NEXT:    rldimi 4, 3, 12, 0
-; CHECK-NEXT:    std 6, 8(4)
+; CHECK-NEXT:    rldicr 3, 3, 0, 51
+; CHECK-NEXT:    or 7, 3, 4
+; CHECK-NEXT:    stdx 5, 3, 4
+; CHECK-NEXT:    std 6, 8(7)
 ; CHECK-NEXT:    blr
 entry:
   %and = and i64 %ptr, -4096
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 37baef60438847a..8a633787038f7be 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -639,9 +639,9 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    lbzu r0, 1(r5)
 ; CHECK-NEXT:    mulli r29, r0, 171
-; CHECK-NEXT:    rlwinm r28, r29, 24, 8, 30
-; CHECK-NEXT:    srwi r29, r29, 9
-; CHECK-NEXT:    add r29, r29, r28
+; CHECK-NEXT:    srwi r28, r29, 9
+; CHECK-NEXT:    rlwinm r29, r29, 24, 8, 30
+; CHECK-NEXT:    add r29, r28, r29
 ; CHECK-NEXT:    sub r0, r0, r29
 ; CHECK-NEXT:    clrlwi r0, r0, 24
 ; CHECK-NEXT:    cmplwi r0, 1
diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll
index 912a74ba8df8fb5..4e57f2f3926a11c 100644
--- a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll
+++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll
@@ -35,7 +35,7 @@ define void @test(ptr dereferenceable(4) %.ial, ptr noalias dereferenceable(4) %
 ; CHECK-P9-NEXT:    addi r8, r5, -8
 ; CHECK-P9-NEXT:    lwz r5, 0(r7)
 ; CHECK-P9-NEXT:    extsw r7, r4
-; CHECK-P9-NEXT:    rldic r4, r3, 3, 29
+; CHECK-P9-NEXT:    sldi r4, r3, 3
 ; CHECK-P9-NEXT:    sub r3, r7, r3
 ; CHECK-P9-NEXT:    addi r10, r4, 8
 ; CHECK-P9-NEXT:    lxvdsx vs0, 0, r8
@@ -87,7 +87,7 @@ define void @test(ptr dereferenceable(4) %.ial, ptr noalias dereferenceable(4) %
 ; CHECK-P10-NEXT:    addi r8, r5, -8
 ; CHECK-P10-NEXT:    lwz r5, 0(r7)
 ; CHECK-P10-NEXT:    extsw r7, r4
-; CHECK-P10-NEXT:    rldic r4, r3, 3, 29
+; CHECK-P10-NEXT:    sldi r4, r3, 3
 ; CHECK-P10-NEXT:    addi r10, r4, 8
 ; CHECK-P10-NEXT:    sub r3, r7, r3
 ; CHECK-P10-NEXT:    lxvdsx vs0, 0, r8
diff --git a/llvm/test/CodeGen/PowerPC/rldimi.ll b/llvm/test/CodeGen/PowerPC/rldimi.ll
new file mode 100644
index 000000000000000..a37bf852499cbba
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/rldimi.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-ibm-aix -mcpu=pwr8 | FileCheck %s
+
+define i64 @rldimi1(i64 %a) {
+; CHECK-LABEL: rldimi1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rldimi 3, 3, 8, 0
+; CHECK-NEXT:    blr
+entry:
+  %x0 = shl i64 %a, 8
+  %x1 = and i64 %a, 255
+  %x2 = or i64 %x0, %x1
+  ret i64 %x2
+}
+
+define i64 @rldimi2(i64 %a) {
+; CHECK-LABEL: rldimi2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rldimi 3, 3, 8, 0
+; CHECK-NEXT:    rldimi 3, 3, 16, 0
+; CHECK-NEXT:    blr
+entry:
+  %x0 = shl i64 %a, 8
+  %x1 = and i64 %a, 255
+  %x2 = or i64 %x0, %x1
+  %x3 = shl i64 %x2, 16
+  %x4 = and i64 %x2, 65535
+  %x5 = or i64 %x3, %x4
+  ret i64 %x5
+}
+
+define i64 @rldimi3(i64 %a) {
+; CHECK-LABEL: rldimi3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rldimi 3, 3, 8, 0
+; CHECK-NEXT:    rldimi 3, 3, 16, 0
+; CHECK-NEXT:    rlwinm 3, 3, 0, 1, 0
+; CHECK-NEXT:    blr
+entry:
+  %0 = shl i64 %a, 8
+  %1 = and i64 %a, 255
+  %2 = or i64 %0, %1
+  %3 = shl i64 %2, 16
+  %4 = and i64 %2, 65535
+  %5 = or i64 %3, %4
+  %6 = shl i64 %5, 32
+  %7 = and i64 %5, 4294967295
+  %8 = or i64 %6, %7
+  ret i64 %8
+}