[llvm] [AVX-512] make vpternlogq more aggressive for longer chains of bitmanipulations (PR #189971)

Julian Pokrovsky via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 08:09:25 PDT 2026


https://github.com/raventid updated https://github.com/llvm/llvm-project/pull/189971

>From 4ec75e67835f852719a4a33064257097faec15ce Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Tue, 24 Mar 2026 14:37:20 +0800
Subject: [PATCH 1/2] [vpternlog] optimize more aggressively

---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 283 ++++++++++++++++--------
 llvm/test/CodeGen/X86/vpternlog.ll      |  78 ++++++-
 2 files changed, 266 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index be95168f2de00..1dc6c10aba4a6 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -32,6 +32,8 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include <cstdint>
+#include <functional>
+#include <optional>
 
 using namespace llvm;
 
@@ -4815,8 +4817,7 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
   return true;
 }
 
-// Try to match two logic ops to a VPTERNLOG.
-// FIXME: Handle more complex patterns that use an operand more than once?
+// Try to match logic trees to one or more VPTERNLOG operations.
 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   MVT NVT = N->getSimpleValueType(0);
 
@@ -4829,118 +4830,214 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
     return false;
 
-  auto getFoldableLogicOp = [](SDValue Op) {
-    // Peek through single use bitcast.
-    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
-      Op = Op.getOperand(0);
+  auto IsLogicOpcode = [](unsigned Opc) {
+    return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+           Opc == X86ISD::ANDNP;
+  };
+
+  auto IsAllOnesXor = [](SDValue V) {
+    return V.getOpcode() == ISD::XOR &&
+           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
+  };
+
+  auto PeelSingleUseBitcast = [](SDValue V) {
+    if (V.getOpcode() == ISD::BITCAST && V.hasOneUse())
+      return V.getOperand(0);
+    return V;
+  };
+
+  // Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
+  // higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
+  // the whole pattern instead.
+  if (N->getOpcode() == ISD::OR && N->hasOneUse()) {
+    SDNode *User = *N->user_begin();
+    while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+      User = *User->user_begin();
+
+    if (User->getOpcode() == ISD::XOR && User->hasOneUse() &&
+        (ISD::isBuildVectorAllOnes(User->getOperand(0).getNode()) ||
+         ISD::isBuildVectorAllOnes(User->getOperand(1).getNode()))) {
+      SDNode *NextUser = *User->user_begin();
+      while (NextUser->getOpcode() == ISD::BITCAST && NextUser->hasOneUse())
+        NextUser = *NextUser->user_begin();
+      unsigned NextOpc = NextUser->getOpcode();
+      if (NextOpc == ISD::AND || NextOpc == X86ISD::ANDNP)
+        return false;
+    }
+  }
 
-    if (!Op.hasOneUse())
-      return SDValue();
+  // Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
+  if (N->getOpcode() == ISD::AND) {
+    for (unsigned Idx = 0; Idx != 2; ++Idx) {
+      SDValue NotSide = N->getOperand(Idx);
+      SDValue A = N->getOperand(Idx ^ 1);
 
-    unsigned Opc = Op.getOpcode();
-    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
-        Opc == X86ISD::ANDNP)
-      return Op;
+      SDValue NotSideNoCast = PeelSingleUseBitcast(NotSide);
+      if (!NotSideNoCast.hasOneUse() || !IsAllOnesXor(NotSideNoCast))
+        continue;
 
-    return SDValue();
+      SDValue Inner = PeelSingleUseBitcast(NotSideNoCast.getOperand(0));
+      if (!Inner.hasOneUse() || Inner.getOpcode() != ISD::OR)
+        continue;
+
+      SDValue B = Inner.getOperand(0);
+      SDValue C = Inner.getOperand(1);
+      if (matchVPTERNLOG(N, N, Inner.getNode(), Inner.getNode(), A, B, C, 0x10))
+        return true;
+    }
+  }
+
+  struct LeafInfo {
+    SDValue Leaf;
+    SDNode *Parent;
+    uint8_t Magic;
   };
 
-  SDValue N0, N1, A, FoldableOp;
+  auto ComputeTernlog = [&](SDValue Root, SDNode *OpaqueSubtree,
+                            SmallVectorImpl<LeafInfo> &Leaves,
+                            uint8_t &ImmOut, bool &TooManyLeaves) {
+    TooManyLeaves = false;
 
-  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
-  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
-    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
-        ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
-      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
+    auto lookupLeaf = [&](SDValue Leaf) -> std::optional<uint8_t> {
+      for (const LeafInfo &L : Leaves)
+        if (L.Leaf == Leaf)
+          return L.Magic;
+      return std::nullopt;
+    };
 
-      if (!InnerOp)
-        return SDValue();
+    std::function<int(SDValue, SDNode *, bool)> ComputeRec =
+        [&](SDValue Op, SDNode *Parent, bool IsRoot) -> int {
+      if (Op.getNode() != OpaqueSubtree) {
+        // Peek through single-use bitcasts.
+        if (Op.getOpcode() == ISD::BITCAST && (IsRoot || Op.hasOneUse())) {
+          Parent = Op.getNode();
+          Op = Op.getOperand(0);
+        }
+
+        if ((IsRoot || Op.hasOneUse()) && IsAllOnesXor(Op)) {
+          int Inner = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+          return Inner < 0 ? -1 : ((~Inner) & 0xFF);
+        }
 
-      N0 = InnerOp.getOperand(0);
-      N1 = InnerOp.getOperand(1);
-      if ((FoldableOp = getFoldableLogicOp(N1))) {
-        A = N0;
-        return InnerOp;
+        if ((IsRoot || Op.hasOneUse()) && IsLogicOpcode(Op.getOpcode())) {
+          int L = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+          int R = ComputeRec(Op.getOperand(1), Op.getNode(), false);
+          if (L < 0 || R < 0)
+            return -1;
+
+          switch (Op.getOpcode()) {
+          default:
+            llvm_unreachable("Unexpected opcode");
+          case ISD::AND:
+            return (L & R) & 0xFF;
+          case ISD::OR:
+            return (L | R) & 0xFF;
+          case ISD::XOR:
+            return (L ^ R) & 0xFF;
+          case X86ISD::ANDNP:
+            return ((~L) & R) & 0xFF;
+          }
+        }
       }
-      if ((FoldableOp = getFoldableLogicOp(N0))) {
-        A = N1;
-        return InnerOp;
+
+      if (auto Existing = lookupLeaf(Op))
+        return *Existing;
+
+      if (Leaves.size() >= 3) {
+        TooManyLeaves = true;
+        return -1;
       }
-    }
-    return SDValue();
-  };
 
-  bool PeeledOuterNot = false;
-  SDNode *OriN = N;
-  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
-    PeeledOuterNot = true;
-    N = InnerOp.getNode();
-  } else {
-    N0 = N->getOperand(0);
-    N1 = N->getOperand(1);
+      static constexpr uint8_t Magics[] = {0xF0, 0xCC, 0xAA};
+      uint8_t Magic = Magics[Leaves.size()];
+      Leaves.push_back({Op, Parent, Magic});
+      return Magic;
+    };
 
-    if ((FoldableOp = getFoldableLogicOp(N1)))
-      A = N0;
-    else if ((FoldableOp = getFoldableLogicOp(N0)))
-      A = N1;
-    else
+    int Imm = ComputeRec(Root, Root.getNode(), true);
+    if (Imm < 0)
       return false;
-  }
+    ImmOut = static_cast<uint8_t>(Imm & 0xFF);
+    return true;
+  };
+
+  auto EmitFromLeaves = [&](SDNode *Root,
+                            const SmallVectorImpl<LeafInfo> &InLeaves,
+                            uint8_t Imm) {
+    assert(!InLeaves.empty() && "Expected at least one leaf");
+    SDValue A = InLeaves[0].Leaf;
+    SDNode *ParentA = InLeaves[0].Parent;
+    SDValue B = A;
+    SDNode *ParentB = ParentA;
+    SDValue C = A;
+    SDNode *ParentC = ParentA;
 
-  SDValue B = FoldableOp.getOperand(0);
-  SDValue C = FoldableOp.getOperand(1);
-  SDNode *ParentA = N;
-  SDNode *ParentB = FoldableOp.getNode();
-  SDNode *ParentC = FoldableOp.getNode();
-
-  // We can build the appropriate control immediate by performing the logic
-  // operation we're matching using these constants for A, B, and C.
-  uint8_t TernlogMagicA = 0xf0;
-  uint8_t TernlogMagicB = 0xcc;
-  uint8_t TernlogMagicC = 0xaa;
-
-  // Some of the inputs may be inverted, peek through them and invert the
-  // magic values accordingly.
-  // TODO: There may be a bitcast before the xor that we should peek through.
-  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
-    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
-        ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
-      Magic = ~Magic;
-      Parent = Op.getNode();
-      Op = Op.getOperand(0);
+    if (InLeaves.size() > 1) {
+      B = InLeaves[1].Leaf;
+      ParentB = InLeaves[1].Parent;
     }
+    if (InLeaves.size() > 2) {
+      C = InLeaves[2].Leaf;
+      ParentC = InLeaves[2].Parent;
+    }
+
+    return matchVPTERNLOG(Root, ParentA, ParentB, ParentC, A, B, C, Imm);
   };
 
-  PeekThroughNot(A, ParentA, TernlogMagicA);
-  PeekThroughNot(B, ParentB, TernlogMagicB);
-  PeekThroughNot(C, ParentC, TernlogMagicC);
-
-  uint8_t Imm;
-  switch (FoldableOp.getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode!");
-  case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
-  case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
-  case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
-  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+  SmallVector<LeafInfo, 3> Leaves;
+  uint8_t Imm = 0;
+  bool TooManyLeaves = false;
+  if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
+                     TooManyLeaves)) {
+    if (Leaves.size() < 2)
+      return false;
+    return EmitFromLeaves(N, Leaves, Imm);
   }
 
-  switch (N->getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode!");
-  case X86ISD::ANDNP:
-    if (A == N0)
-      Imm &= ~TernlogMagicA;
-    else
-      Imm = ~(Imm) & TernlogMagicA;
-    break;
-  case ISD::AND: Imm &= TernlogMagicA; break;
-  case ISD::OR:  Imm |= TernlogMagicA; break;
-  case ISD::XOR: Imm ^= TernlogMagicA; break;
-  }
+  // Generic cascading for >3 leaves: keep one direct root operand as an opaque
+  // input leaf, then fold the remaining logic around it. This allows
+  // multi-level trees to be selected as chained VPTERNLOG operations.
+  if (TooManyLeaves) {
+    auto IsGoodOpaqueCandidate = [&](SDValue V) {
+      SDValue P = PeelSingleUseBitcast(V);
+      if (IsAllOnesXor(P))
+        P = PeelSingleUseBitcast(P.getOperand(0));
+      return IsLogicOpcode(P.getOpcode());
+    };
+
+    SmallVector<unsigned, 2> CandidateOrder;
+    if (IsGoodOpaqueCandidate(N->getOperand(0)))
+      CandidateOrder.push_back(0);
+    if (IsGoodOpaqueCandidate(N->getOperand(1)))
+      CandidateOrder.push_back(1);
+    if (CandidateOrder.empty())
+      return false;
+
+    // Prefer single-use subtrees first; they are better cascading anchors.
+    if (CandidateOrder.size() == 2 &&
+        !N->getOperand(CandidateOrder[0]).hasOneUse() &&
+        N->getOperand(CandidateOrder[1]).hasOneUse())
+      std::swap(CandidateOrder[0], CandidateOrder[1]);
 
-  if (PeeledOuterNot)
-    Imm = ~Imm;
+    for (unsigned Idx : CandidateOrder) {
+      SmallVector<LeafInfo, 3> CascadedLeaves;
+      uint8_t CascadedImm = 0;
+      bool CascadedTooManyLeaves = false;
+      SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
 
-  return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
+      if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+                          CascadedImm, CascadedTooManyLeaves))
+        continue;
+
+      if (CascadedLeaves.size() < 2)
+        continue;
+
+      if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+        return true;
+    }
+  }
+
+  return false;
 }
 
 /// If the high bits of an 'and' operand are known zero, try setting the
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index bd7478d3a82d5..100d31883823d 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -4,7 +4,7 @@
 define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT:    vpternlogq $1
 ; CHECK-NEXT:    retq
   %and.demorgan = or <8 x i64> %b, %a
   %and3.demorgan = or <8 x i64> %and.demorgan, %c
@@ -15,7 +15,7 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
 ; CHECK-LABEL: xorbitcast:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT:    vpternlogq $1
 ; CHECK-NEXT:    retq
   %or1 = or <64 x i8> %a, %b
   %or2 = or <64 x i8> %or1, %c
@@ -23,3 +23,77 @@ define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
   %xor = xor <8 x i64> %cast, splat (i64 -1)
   ret <8 x i64> %xor
 }
+
+define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                         <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: foobar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $208
+; CHECK-NEXT:    vpternlogq $252
+; CHECK-NEXT:    vpandnq
+; CHECK-NEXT:    retq
+  %nb = xor <8 x i64> %b, splat (i64 -1)
+  %or = or <8 x i64> %nb, %c
+  %foo = and <8 x i64> %or, %a
+  %de = or <8 x i64> %d, %e
+  %nde = xor <8 x i64> %de, splat (i64 -1)
+  %bar = and <8 x i64> %foo, %nde
+  ret <8 x i64> %bar
+}
+
+define <8 x i64> @or_not_and_guard(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                   <8 x i64> %d) {
+; CHECK-LABEL: or_not_and_guard:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $16
+; CHECK-NOT:   vpternlogq $252
+; CHECK:       retq
+  %or = or <8 x i64> %b, %c
+  %not_or = xor <8 x i64> %or, splat (i64 -1)
+  %lhs = and <8 x i64> %a, %not_or
+  %res = and <8 x i64> %lhs, %d
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @depth4_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                               <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: depth4_chain:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %t0 = xor <8 x i64> %a, %b
+  %t1 = or <8 x i64> %t0, %c
+  %t2 = and <8 x i64> %t1, %d
+  %t3 = xor <8 x i64> %t2, %e
+  ret <8 x i64> %t3
+}
+
+define <8 x i64> @depth5_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                               <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: depth5_chain:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %t0 = and <8 x i64> %a, %b
+  %t1 = xor <8 x i64> %t0, %c
+  %t2 = or <8 x i64> %t1, %d
+  %t3 = and <8 x i64> %t2, %e
+  %t4 = xor <8 x i64> %t3, %f
+  ret <8 x i64> %t4
+}
+
+define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                 <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: balanced_depth:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       vpternlogq
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %l0 = or <8 x i64> %a, %b
+  %l1 = xor <8 x i64> %c, %d
+  %l2 = and <8 x i64> %l0, %l1
+  %r0 = xor <8 x i64> %e, %f
+  %res = or <8 x i64> %l2, %r0
+  ret <8 x i64> %res
+}

>From ecc91eba380937db5e8d1ae26913155f441ec792 Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Wed, 1 Apr 2026 22:46:46 +0800
Subject: [PATCH 2/2] [AVX-512] make vpternlogq more aggressive for longer
 chains of bitmanipulations

---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 133 +++++++++++++
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp    | 221 ++++++++++++++++++++-
 llvm/test/CodeGen/X86/vpternlog.ll         | 123 +++++++++++-
 3 files changed, 469 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index af3c3af38e681..c766da744a581 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -21,6 +21,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86RegisterInfo.h"
@@ -37,8 +38,135 @@ using namespace llvm;
 #define DEBUG_TYPE "x86-fixup-inst-tuning"
 
 STATISTIC(NumInstChanges, "Number of instructions changes");
+STATISTIC(NumVPTERNLOGNotFolds,
+          "Number of all-ones + XOR-mem fused into VPTERNLOG NOT");
 
 namespace {
+
+/// Return the VPTERNLOG-rmi opcode for the given XOR-mem opcode, or 0 if there
+/// is no corresponding opcode.  We use the Q (64-bit element) variant for the
+/// VPTERNLOG so that the memory operand can be folded with the larger element
+/// granularity – element type is irrelevant for a bitwise NOT.
+static unsigned getVPTERNLOGForXORrm(unsigned XorOpc) {
+  switch (XorOpc) {
+  default:
+    return 0;
+  case X86::VPXORQZrm:
+  case X86::VPXORDZrm:
+    return X86::VPTERNLOGQZrmi;
+  case X86::VPXORQZ256rm:
+  case X86::VPXORDZ256rm:
+    return X86::VPTERNLOGQZ256rmi;
+  case X86::VPXORQZ128rm:
+  case X86::VPXORDZ128rm:
+    return X86::VPTERNLOGQZ128rmi;
+  }
+}
+
+/// Return true if \p MI is a VPTERNLOG-rri that materializes all-ones
+/// (immediate 0xFF) with all source operands marked as undef.
+static bool isVPTERNLOGAllOnes(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case X86::VPTERNLOGDZrri:
+  case X86::VPTERNLOGQZrri:
+  case X86::VPTERNLOGDZ256rri:
+  case X86::VPTERNLOGQZ256rri:
+  case X86::VPTERNLOGDZ128rri:
+  case X86::VPTERNLOGQZ128rri:
+    break;
+  }
+  // The last operand is the immediate; it must be 0xFF (all-ones).
+  const MachineOperand &ImmOp = MI.getOperand(MI.getNumOperands() - 1);
+  return ImmOp.isImm() && (ImmOp.getImm() & 0xFF) == 0xFF;
+}
+
+/// Try to fuse an all-ones materialization followed by a vector XOR-from-memory
+/// into a single VPTERNLOG NOT-from-memory:
+///
+///   $dst = VPTERNLOGDZrri undef $dst, undef $dst, undef $dst, 255
+///   $dst = VPXORQZrm      killed $dst, <mem>
+///
+/// becomes:
+///
+///   $dst = VPTERNLOGQZrmi  undef $dst, undef $dst, <mem>, 0x55
+///
+/// The immediate 0x55 = ~C (where C = src3 = memory operand), which is
+/// independent of src1 and src2.
+static bool tryFuseNotFromMem(const X86InstrInfo *TII, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &I) {
+  MachineInstr &XorMI = *I;
+  unsigned TernlogOpc = getVPTERNLOGForXORrm(XorMI.getOpcode());
+  if (!TernlogOpc)
+    return false;
+
+  // The XOR-rm layout: $dst(tied=$src1), $src1, base, scale, index, disp, seg.
+  Register DstReg = XorMI.getOperand(0).getReg();
+
+  // Walk backward to find the all-ones materialization.  Skip debug and
+  // position-independent instructions, but stop at any real instruction that
+  // touches DstReg.
+  MachineBasicBlock::iterator PrevIt = I;
+  if (PrevIt == MBB.begin())
+    return false;
+
+  MachineInstr *AllOnesMI = nullptr;
+  for (--PrevIt;; --PrevIt) {
+    MachineInstr &Prev = *PrevIt;
+
+    if (Prev.isDebugInstr()) {
+      if (PrevIt == MBB.begin())
+        return false;
+      continue;
+    }
+
+    if (isVPTERNLOGAllOnes(Prev) && Prev.getOperand(0).getReg() == DstReg) {
+      AllOnesMI = &Prev;
+      break;
+    }
+
+    // Any other instruction that reads or writes DstReg blocks the fold.
+    return false;
+  }
+
+  if (!AllOnesMI)
+    return false;
+
+  // Verify that the all-ones defines only DstReg and has no other users
+  // between itself and the XOR.  Since they are adjacent (modulo debug instrs)
+  // and both write DstReg, this is guaranteed.
+
+  LLVM_DEBUG(dbgs() << "Fusing VPTERNLOG NOT-from-memory:\n"
+                    << "  " << *AllOnesMI << "  " << XorMI);
+
+  // Build: $dst = VPTERNLOGQZrmi undef $dst, undef $dst, <mem>, 0x55
+  // The XOR-rm operands: 0=dst, 1=src1, 2..6=mem(base,scale,index,disp,seg)
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, I, XorMI.getDebugLoc(), TII->get(TernlogOpc), DstReg)
+          .addReg(DstReg, RegState::Undef)  // src1 (tied, don't care)
+          .addReg(DstReg, RegState::Undef); // src2 (don't care)
+
+  // Copy the 5 memory addressing operands from the XOR.
+  for (unsigned J = 2; J < 2 + X86::AddrNumOperands; ++J)
+    MIB.add(XorMI.getOperand(J));
+
+  MIB.addImm(0x55); // imm = ~C where C = src3 = memory operand
+
+  // Preserve mem-refs from the XOR.
+  MIB.setMemRefs(XorMI.memoperands());
+
+  LLVM_DEBUG(dbgs() << "  -> " << *MIB);
+
+  // Erase the two old instructions.
+  AllOnesMI->eraseFromParent();
+  I = MIB.getInstr()->getIterator();
+  XorMI.eraseFromParent();
+
+  ++NumVPTERNLOGNotFolds;
+  return true;
+}
+
 class X86FixupInstTuningImpl {
 public:
   bool runOnMachineFunction(MachineFunction &MF);
@@ -683,6 +811,11 @@ bool X86FixupInstTuningImpl::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+      // Try fusing all-ones + XOR-mem → VPTERNLOG NOT-mem first.
+      if (tryFuseNotFromMem(TII, MBB, I)) {
+        Changed = true;
+        continue;
+      }
       if (processInstruction(MF, MBB, I)) {
         ++NumInstChanges;
         Changed = true;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1dc6c10aba4a6..17c7d7dfe92a5 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4835,6 +4835,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
            Opc == X86ISD::ANDNP;
   };
 
+  auto IsAllOnesVec = [](SDValue V) {
+    return ISD::isBuildVectorAllOnes(V.getNode());
+  };
+
   auto IsAllOnesXor = [](SDValue V) {
     return V.getOpcode() == ISD::XOR &&
            ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
@@ -4846,6 +4850,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
     return V;
   };
 
+  std::function<bool(SDValue, unsigned)> IsHomogeneousAssociativeTree =
+      [&](SDValue V, unsigned Opc) {
+        V = PeelSingleUseBitcast(V);
+        if (V.getOpcode() != Opc)
+          return true;
+        if (!V.hasOneUse())
+          return false;
+        return IsHomogeneousAssociativeTree(V.getOperand(0), Opc) &&
+               IsHomogeneousAssociativeTree(V.getOperand(1), Opc);
+      };
+
+  auto IsLoadLike = [](SDValue V) {
+    return isa<LoadSDNode>(V.getNode()) ||
+           V.getOpcode() == X86ISD::VBROADCAST_LOAD;
+  };
+
+  // Fast-path: X ^ -1 -> ~X.
+  //
+  // Use X for all three VPTERNLOG inputs and select an immediate that yields
+  // ~X when A == B == C == X (imm bit0 = 1, bit7 = 0; other bits are don't
+  // care). This avoids introducing undef register operands.
+  //
+  // Keep this for register-like X only. For load-like X, this can cause an
+  // extra move/load before a folded-load VPTERNLOG form, which is usually not
+  // profitable.
+  if (N->getOpcode() == ISD::XOR) {
+    for (unsigned Idx = 0; Idx != 2; ++Idx) {
+      if (!IsAllOnesVec(N->getOperand(Idx)))
+        continue;
+
+      SDValue X = N->getOperand(Idx ^ 1);
+      SDValue XNoCast = PeelSingleUseBitcast(X);
+      if (IsLogicOpcode(XNoCast.getOpcode()) || IsAllOnesXor(XNoCast) ||
+          IsLoadLike(XNoCast))
+        continue;
+
+      if (matchVPTERNLOG(N, N, N, N, X, X, X, 0x01))
+        return true;
+    }
+  }
+
   // Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
   // higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
   // the whole pattern instead.
@@ -4866,6 +4911,26 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
     }
   }
 
+  // Avoid consuming xor(logic_op, -1) (i.e. NOT of a logic sub-tree) into a
+  // stand-alone VPTERNLOG when the parent is also a logic op.  The parent's
+  // ComputeTernlog will fold the NOT directly into the truth-table immediate,
+  // producing fewer instructions overall.
+  if (N->getOpcode() == ISD::XOR && N->hasOneUse() &&
+      (ISD::isBuildVectorAllOnes(N->getOperand(0).getNode()) ||
+       ISD::isBuildVectorAllOnes(N->getOperand(1).getNode()))) {
+    SDValue Inner = ISD::isBuildVectorAllOnes(N->getOperand(1).getNode())
+                        ? N->getOperand(0)
+                        : N->getOperand(1);
+    SDValue InnerNoCast = PeelSingleUseBitcast(Inner);
+    if (IsLogicOpcode(InnerNoCast.getOpcode())) {
+      SDNode *User = *N->user_begin();
+      while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+        User = *User->user_begin();
+      if (IsLogicOpcode(User->getOpcode()))
+        return false;
+    }
+  }
+
   // Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
   if (N->getOpcode() == ISD::AND) {
     for (unsigned Idx = 0; Idx != 2; ++Idx) {
@@ -4894,8 +4959,8 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   };
 
   auto ComputeTernlog = [&](SDValue Root, SDNode *OpaqueSubtree,
-                            SmallVectorImpl<LeafInfo> &Leaves,
-                            uint8_t &ImmOut, bool &TooManyLeaves) {
+                            SmallVectorImpl<LeafInfo> &Leaves, uint8_t &ImmOut,
+                            bool &TooManyLeaves) {
     TooManyLeaves = false;
 
     auto lookupLeaf = [&](SDValue Leaf) -> std::optional<uint8_t> {
@@ -4989,7 +5054,13 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   bool TooManyLeaves = false;
   if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
                      TooManyLeaves)) {
-    if (Leaves.size() < 2)
+    if (Leaves.empty())
+      return false;
+    // A single-leaf load folded into VPTERNLOG causes a redundant explicit
+    // load (for the tied src1=dst) plus a folded load, doubling memory
+    // traffic. Bail out and let default lowering handle it (e.g.
+    // SETALLONES + VPXORQ mem for NOT-of-load).
+    if (Leaves.size() == 1 && IsLoadLike(PeelSingleUseBitcast(Leaves[0].Leaf)))
       return false;
     return EmitFromLeaves(N, Leaves, Imm);
   }
@@ -4998,6 +5069,45 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   // input leaf, then fold the remaining logic around it. This allows
   // multi-level trees to be selected as chained VPTERNLOG operations.
   if (TooManyLeaves) {
+    bool IsAssocOp = N->getOpcode() == ISD::OR || N->getOpcode() == ISD::AND ||
+                     N->getOpcode() == ISD::XOR;
+    bool IsHomogeneousAssoc = IsAssocOp && IsHomogeneousAssociativeTree(
+                                               SDValue(N, 0), N->getOpcode());
+
+    // For homogeneous associative trees, prefer choosing an opaque subtree
+    // from one level below the root-side logic node so we can expose two fresh
+    // siblings and form 3-input VPTERNLOG combines (e.g. OR reductions as
+    // repeated imm=254), rather than creating long 2-input $252/$250 chains.
+    if (IsHomogeneousAssoc) {
+      for (unsigned RootIdx = 0; RootIdx != 2; ++RootIdx) {
+        SDValue Side = PeelSingleUseBitcast(N->getOperand(RootIdx));
+        if (!Side.hasOneUse() || Side.getOpcode() != N->getOpcode())
+          continue;
+
+        for (unsigned ChildIdx = 0; ChildIdx != 2; ++ChildIdx) {
+          SDValue Child = PeelSingleUseBitcast(Side.getOperand(ChildIdx));
+          if (!Child.hasOneUse() || Child.getOpcode() != N->getOpcode())
+            continue;
+
+          SmallVector<LeafInfo, 3> AssocLeaves;
+          uint8_t AssocImm = 0;
+          bool AssocTooManyLeaves = false;
+          if (!ComputeTernlog(SDValue(N, 0), Child.getNode(), AssocLeaves,
+                              AssocImm, AssocTooManyLeaves))
+            continue;
+
+          if (AssocLeaves.size() < 2)
+            continue;
+
+          if (EmitFromLeaves(N, AssocLeaves, AssocImm))
+            return true;
+        }
+      }
+      // Fall through to generic cascading — for balanced trees all
+      // grandchildren may be leaves, so the child-as-opaque strategy below
+      // can still produce a valid 3-input combine.
+    }
+
     auto IsGoodOpaqueCandidate = [&](SDValue V) {
       SDValue P = PeelSingleUseBitcast(V);
       if (IsAllOnesXor(P))
@@ -5019,6 +5129,25 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
         N->getOperand(CandidateOrder[1]).hasOneUse())
       std::swap(CandidateOrder[0], CandidateOrder[1]);
 
+    // Collect all opaque subtree candidates: direct children and, if a direct
+    // child is itself a logic op, its grandchildren.  Trying grandchildren
+    // allows the root VPTERNLOG to absorb more distinct operands (3 instead
+    // of 2), which produces tighter cascades.  Example:
+    //
+    //      xor              With child "and" opaque: 2 leaves (and, e)
+    //     /   \             With grandchild "or" opaque: 3 leaves (or, d, e)
+    //   and    e            → saves one instruction in the cascade.
+    //  /   \
+    // or    d
+    //
+    // Try direct children first.  Only explore grandchildren when all direct
+    // children produce ≤2 leaves (i.e. a degenerate 2-input fold that wastes
+    // a VPTERNLOG slot) AND the opaque subtree itself has >3 leaves, meaning
+    // a single VPTERNLOG cannot handle it. When the opaque child fits in one
+    // VPTERNLOG (≤3 leaves), going deeper just reshuffles the split without
+    // saving instructions.
+    bool TriedDirect = false;
+    bool NeedsGrandchild = false;
     for (unsigned Idx : CandidateOrder) {
       SmallVector<LeafInfo, 3> CascadedLeaves;
       uint8_t CascadedImm = 0;
@@ -5029,11 +5158,91 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
                           CascadedImm, CascadedTooManyLeaves))
         continue;
 
-      if (CascadedLeaves.size() < 2)
+      if (CascadedLeaves.empty())
         continue;
 
-      if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
-        return true;
+      // If the direct child yields a 3-leaf fold, emit it right away — this
+      // is already optimal for this level.
+      if (CascadedLeaves.size() == 3) {
+        if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+          return true;
+      }
+      TriedDirect = true;
+
+      // Check if the opaque subtree itself has >3 leaves: if so, it cannot
+      // be handled by a single VPTERNLOG, and going one level deeper may
+      // help reduce the total instruction count.
+      if (CascadedLeaves.size() <= 2) {
+        SmallVector<LeafInfo, 3> SubLeaves;
+        uint8_t SubImm = 0;
+        bool SubTooMany = false;
+        if (!ComputeTernlog(SDValue(N->getOperand(Idx)),
+                            /*OpaqueSubtree=*/nullptr, SubLeaves, SubImm,
+                            SubTooMany) &&
+            SubTooMany)
+          NeedsGrandchild = true;
+      }
+    }
+
+    // Direct children only yielded ≤2-leaf folds and the opaque subtree has
+    // >3 leaves (can't fit in one VPTERNLOG).  Try grandchildren — making a
+    // deeper subtree opaque exposes more leaves at the root level, reducing
+    // the total instruction count.
+    if (NeedsGrandchild) {
+      for (unsigned Idx : CandidateOrder) {
+        SDValue Child = N->getOperand(Idx);
+        SDValue ChildNoCast = PeelSingleUseBitcast(Child);
+        if (!ChildNoCast.hasOneUse() || !IsLogicOpcode(ChildNoCast.getOpcode()))
+          continue;
+
+        for (unsigned GIdx = 0; GIdx != 2; ++GIdx) {
+          SDValue GChild = ChildNoCast.getOperand(GIdx);
+          SDValue GChildNoCast = PeelSingleUseBitcast(GChild);
+
+          // Skip NOT-wrappers (xor X, -1): ComputeTernlog already folds NOT
+          // into the truth table, so cutting at a NOT boundary just pushes
+          // the NOT into a separate instruction without saving anything.
+          if (IsAllOnesXor(GChildNoCast))
+            continue;
+
+          if (!IsLogicOpcode(GChildNoCast.getOpcode()))
+            continue;
+
+          SmallVector<LeafInfo, 3> CascadedLeaves;
+          uint8_t CascadedImm = 0;
+          bool CascadedTooManyLeaves = false;
+
+          if (!ComputeTernlog(SDValue(N, 0), GChild.getNode(), CascadedLeaves,
+                              CascadedImm, CascadedTooManyLeaves))
+            continue;
+
+          if (CascadedLeaves.size() < 3)
+            continue;
+
+          if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+            return true;
+        }
+      }
+    }
+
+    // Fall back to direct-child opaque with ≤2 leaves if nothing else worked.
+    if (TriedDirect) {
+      for (unsigned Idx : CandidateOrder) {
+        SmallVector<LeafInfo, 3> CascadedLeaves;
+        uint8_t CascadedImm = 0;
+        bool CascadedTooManyLeaves = false;
+        SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
+
+        if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+                            CascadedImm, CascadedTooManyLeaves))
+          continue;
+
+        if (CascadedLeaves.empty())
+          continue;
+
+        if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+          return true;
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index 100d31883823d..4f9e384bd85a0 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -29,8 +29,7 @@ define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
 ; CHECK-LABEL: foobar:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpternlogq $208
-; CHECK-NEXT:    vpternlogq $252
-; CHECK-NEXT:    vpandnq
+; CHECK-NEXT:    vpternlogq $16
 ; CHECK-NEXT:    retq
   %nb = xor <8 x i64> %b, splat (i64 -1)
   %or = or <8 x i64> %nb, %c
@@ -97,3 +96,123 @@ define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
   %res = or <8 x i64> %l2, %r0
   ret <8 x i64> %res
 }
+
+define <8 x i64> @flip(ptr %x) { ; a lone NOT must still select to a single vpternlogq
+; CHECK-LABEL: flip:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $85, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %ld = load <8 x i64>, ptr %x, align 64 ; load folds into the ternlog memory operand (the (%rdi) above)
+  %not = xor <8 x i64> %ld, splat (i64 -1) ; NOT spelled as xor with all-ones
+  ret <8 x i64> %not
+}
+
+define dso_local <8 x i64> @fubar(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+                                  <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+                                  <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+    local_unnamed_addr {
+; CHECK-LABEL: fubar:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK-NOT:   vpternlogq $252
+; CHECK-NOT:   vpternlogq $250
+; CHECK:       retq
+Entry:
+  %9 = or <8 x i64> %1, %0 ; linear chain of eight ORs over nine leaves
+  %10 = or <8 x i64> %9, %2
+  %11 = or <8 x i64> %10, %3
+  %12 = or <8 x i64> %11, %4
+  %13 = or <8 x i64> %12, %5
+  %14 = or <8 x i64> %13, %6
+  %15 = or <8 x i64> %14, %7
+  %16 = or <8 x i64> %15, %8
+  ret <8 x i64> %16 ; expect four imm=254 (a|b|c) ternlogs and no leftover 252/250 immediates
+}
+
+define dso_local <8 x i64> @baz(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+                                <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+                                <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+    local_unnamed_addr { ; pre-formed ternlog intrinsics: selection must still yield the four imm=254 ternlogs
+; CHECK-LABEL: baz:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       retq
+Entry:
+  %9 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %0,
+                                                            <8 x i64> %1,
+                                                            <8 x i64> %2,
+                                                            i32 254) ; imm 254 = a|b|c truth table
+  %10 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %3,
+                                                             <8 x i64> %4,
+                                                             <8 x i64> %5,
+                                                             i32 254)
+  %11 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %6,
+                                                             <8 x i64> %7,
+                                                             <8 x i64> %8,
+                                                             i32 254)
+  %12 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %9,
+                                                             <8 x i64> %10,
+                                                             <8 x i64> %11,
+                                                             i32 254)
+  ret <8 x i64> %12
+}
+
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>,
+                                                   <8 x i64>, i32 immarg)
+
+; 256-bit vector case - vpternlogq on <4 x i64> requires VLX; NOR3 folds to imm 1.
+define <4 x i64> @foo_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-LABEL: foo_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $1
+; CHECK-NEXT:    retq
+  %or1 = or <4 x i64> %b, %a
+  %or2 = or <4 x i64> %or1, %c
+  %not = xor <4 x i64> %or2, splat (i64 -1) ; ~(a|b|c): only the all-zero row of the table, i.e. imm 1
+  ret <4 x i64> %not
+}
+
+; 128-bit vector case - vpternlogq on <2 x i64> requires VLX; NOR3 folds to imm 1.
+define <2 x i64> @foo_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: foo_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $1
+; CHECK-NEXT:    retq
+  %or1 = or <2 x i64> %b, %a
+  %or2 = or <2 x i64> %or1, %c
+  %not = xor <2 x i64> %or2, splat (i64 -1) ; ~(a|b|c): only the all-zero row of the table, i.e. imm 1
+  ret <2 x i64> %not
+}
+
+; Balanced OR tree - tests fallthrough from homogeneous-associative to generic.
+define <8 x i64> @balanced_or4(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                <8 x i64> %d) {
+; CHECK-LABEL: balanced_or4:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK-NOT:   vporq
+; CHECK:       retq
+  %t1 = or <8 x i64> %a, %b
+  %t2 = or <8 x i64> %c, %d
+  %res = or <8 x i64> %t1, %t2 ; four leaves: no plain vporq may survive after the ternlog
+  ret <8 x i64> %res
+}
+
+; Multi-use operand - %a is used in both ORs.  hasOneUse() on the shared
+; value must not block the combine.
+define <8 x i64> @shared_operand(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: shared_operand:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK-NEXT:    retq
+  %or1 = or <8 x i64> %a, %b
+  %or2 = or <8 x i64> %a, %c
+  %res = and <8 x i64> %or1, %or2 ; (a|b)&(a|c) == a|(b&c): a 3-input function despite 4 operand uses
+  ret <8 x i64> %res
+}



More information about the llvm-commits mailing list