[llvm] [AVX-512] make vpternlogq more aggressive for longer chains of bit manipulations (PR #189971)

Julian Pokrovsky via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 07:50:20 PDT 2026


https://github.com/raventid created https://github.com/llvm/llvm-project/pull/189971

This is an initial implementation of a few ad-hoc heuristics to make vpternlog matching more aggressive on longer bit-operation chains. The current implementation introduces a few regressions in existing test cases, so it should be considered experimental and is not yet ready to be merged upstream.

Published for initial demonstration.

Resolves https://github.com/llvm/llvm-project/issues/134768
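
For context, all of these heuristics build on the standard VPTERNLOG trick: the 8-bit control immediate is the truth table of the matched expression, obtained by evaluating it bitwise over the three "magic" constants 0xF0, 0xCC and 0xAA (inputs A, B, C). A minimal stand-alone C++ sketch (illustration only, not code from the patch) of how the A & ~(B | C) fast path below arrives at imm 0x10:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Truth tables of the three inputs across all 8 combinations of (A,B,C).
      const uint8_t MagicA = 0xF0, MagicB = 0xCC, MagicC = 0xAA;
      // Evaluate the matched expression over the magic constants; the result
      // is the VPTERNLOG control immediate.
      uint8_t Imm = MagicA & static_cast<uint8_t>(~(MagicB | MagicC));
      std::printf("0x%02X\n", unsigned{Imm}); // prints 0x10
    }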

From 4ec75e67835f852719a4a33064257097faec15ce Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Tue, 24 Mar 2026 14:37:20 +0800
Subject: [PATCH 1/2] [vpternlog] optimize more aggressively

---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 283 ++++++++++++++++--------
 llvm/test/CodeGen/X86/vpternlog.ll      |  78 ++++++-
 2 files changed, 266 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index be95168f2de00..1dc6c10aba4a6 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -32,6 +32,8 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include <cstdint>
+#include <functional>
+#include <optional>
 
 using namespace llvm;
 
@@ -4815,8 +4817,7 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
   return true;
 }
 
-// Try to match two logic ops to a VPTERNLOG.
-// FIXME: Handle more complex patterns that use an operand more than once?
+// Try to match logic trees to one or more VPTERNLOG operations.
 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   MVT NVT = N->getSimpleValueType(0);
 
@@ -4829,118 +4830,214 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
     return false;
 
-  auto getFoldableLogicOp = [](SDValue Op) {
-    // Peek through single use bitcast.
-    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
-      Op = Op.getOperand(0);
+  auto IsLogicOpcode = [](unsigned Opc) {
+    return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+           Opc == X86ISD::ANDNP;
+  };
+
+  auto IsAllOnesXor = [](SDValue V) {
+    return V.getOpcode() == ISD::XOR &&
+           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
+  };
+
+  auto PeelSingleUseBitcast = [](SDValue V) {
+    if (V.getOpcode() == ISD::BITCAST && V.hasOneUse())
+      return V.getOperand(0);
+    return V;
+  };
+
+  // Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
+  // higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
+  // the whole pattern instead.
+  if (N->getOpcode() == ISD::OR && N->hasOneUse()) {
+    SDNode *User = *N->user_begin();
+    while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+      User = *User->user_begin();
+
+    if (User->getOpcode() == ISD::XOR && User->hasOneUse() &&
+        (ISD::isBuildVectorAllOnes(User->getOperand(0).getNode()) ||
+         ISD::isBuildVectorAllOnes(User->getOperand(1).getNode()))) {
+      SDNode *NextUser = *User->user_begin();
+      while (NextUser->getOpcode() == ISD::BITCAST && NextUser->hasOneUse())
+        NextUser = *NextUser->user_begin();
+      unsigned NextOpc = NextUser->getOpcode();
+      if (NextOpc == ISD::AND || NextOpc == X86ISD::ANDNP)
+        return false;
+    }
+  }
 
-    if (!Op.hasOneUse())
-      return SDValue();
+  // Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
+  if (N->getOpcode() == ISD::AND) {
+    for (unsigned Idx = 0; Idx != 2; ++Idx) {
+      SDValue NotSide = N->getOperand(Idx);
+      SDValue A = N->getOperand(Idx ^ 1);
 
-    unsigned Opc = Op.getOpcode();
-    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
-        Opc == X86ISD::ANDNP)
-      return Op;
+      SDValue NotSideNoCast = PeelSingleUseBitcast(NotSide);
+      if (!NotSideNoCast.hasOneUse() || !IsAllOnesXor(NotSideNoCast))
+        continue;
 
-    return SDValue();
+      SDValue Inner = PeelSingleUseBitcast(NotSideNoCast.getOperand(0));
+      if (!Inner.hasOneUse() || Inner.getOpcode() != ISD::OR)
+        continue;
+
+      SDValue B = Inner.getOperand(0);
+      SDValue C = Inner.getOperand(1);
+      if (matchVPTERNLOG(N, N, Inner.getNode(), Inner.getNode(), A, B, C, 0x10))
+        return true;
+    }
+  }
+
+  struct LeafInfo {
+    SDValue Leaf;
+    SDNode *Parent;
+    uint8_t Magic;
   };
 
-  SDValue N0, N1, A, FoldableOp;
+  auto ComputeTernlog = [&](SDValue Root, SDNode *OpaqueSubtree,
+                            SmallVectorImpl<LeafInfo> &Leaves,
+                            uint8_t &ImmOut, bool &TooManyLeaves) {
+    TooManyLeaves = false;
 
-  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
-  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
-    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
-        ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
-      SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
+    auto lookupLeaf = [&](SDValue Leaf) -> std::optional<uint8_t> {
+      for (const LeafInfo &L : Leaves)
+        if (L.Leaf == Leaf)
+          return L.Magic;
+      return std::nullopt;
+    };
 
-      if (!InnerOp)
-        return SDValue();
+    std::function<int(SDValue, SDNode *, bool)> ComputeRec =
+        [&](SDValue Op, SDNode *Parent, bool IsRoot) -> int {
+      if (Op.getNode() != OpaqueSubtree) {
+        // Peek through single-use bitcasts.
+        if (Op.getOpcode() == ISD::BITCAST && (IsRoot || Op.hasOneUse())) {
+          Parent = Op.getNode();
+          Op = Op.getOperand(0);
+        }
+
+        if ((IsRoot || Op.hasOneUse()) && IsAllOnesXor(Op)) {
+          int Inner = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+          return Inner < 0 ? -1 : ((~Inner) & 0xFF);
+        }
 
-      N0 = InnerOp.getOperand(0);
-      N1 = InnerOp.getOperand(1);
-      if ((FoldableOp = getFoldableLogicOp(N1))) {
-        A = N0;
-        return InnerOp;
+        if ((IsRoot || Op.hasOneUse()) && IsLogicOpcode(Op.getOpcode())) {
+          int L = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+          int R = ComputeRec(Op.getOperand(1), Op.getNode(), false);
+          if (L < 0 || R < 0)
+            return -1;
+
+          switch (Op.getOpcode()) {
+          default:
+            llvm_unreachable("Unexpected opcode");
+          case ISD::AND:
+            return (L & R) & 0xFF;
+          case ISD::OR:
+            return (L | R) & 0xFF;
+          case ISD::XOR:
+            return (L ^ R) & 0xFF;
+          case X86ISD::ANDNP:
+            return ((~L) & R) & 0xFF;
+          }
+        }
       }
-      if ((FoldableOp = getFoldableLogicOp(N0))) {
-        A = N1;
-        return InnerOp;
+
+      if (auto Existing = lookupLeaf(Op))
+        return *Existing;
+
+      if (Leaves.size() >= 3) {
+        TooManyLeaves = true;
+        return -1;
       }
-    }
-    return SDValue();
-  };
 
-  bool PeeledOuterNot = false;
-  SDNode *OriN = N;
-  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
-    PeeledOuterNot = true;
-    N = InnerOp.getNode();
-  } else {
-    N0 = N->getOperand(0);
-    N1 = N->getOperand(1);
+      static constexpr uint8_t Magics[] = {0xF0, 0xCC, 0xAA};
+      uint8_t Magic = Magics[Leaves.size()];
+      Leaves.push_back({Op, Parent, Magic});
+      return Magic;
+    };
 
-    if ((FoldableOp = getFoldableLogicOp(N1)))
-      A = N0;
-    else if ((FoldableOp = getFoldableLogicOp(N0)))
-      A = N1;
-    else
+    int Imm = ComputeRec(Root, Root.getNode(), true);
+    if (Imm < 0)
       return false;
-  }
+    ImmOut = static_cast<uint8_t>(Imm & 0xFF);
+    return true;
+  };
+
+  auto EmitFromLeaves = [&](SDNode *Root,
+                            const SmallVectorImpl<LeafInfo> &InLeaves,
+                            uint8_t Imm) {
+    assert(!InLeaves.empty() && "Expected at least one leaf");
+    SDValue A = InLeaves[0].Leaf;
+    SDNode *ParentA = InLeaves[0].Parent;
+    SDValue B = A;
+    SDNode *ParentB = ParentA;
+    SDValue C = A;
+    SDNode *ParentC = ParentA;
 
-  SDValue B = FoldableOp.getOperand(0);
-  SDValue C = FoldableOp.getOperand(1);
-  SDNode *ParentA = N;
-  SDNode *ParentB = FoldableOp.getNode();
-  SDNode *ParentC = FoldableOp.getNode();
-
-  // We can build the appropriate control immediate by performing the logic
-  // operation we're matching using these constants for A, B, and C.
-  uint8_t TernlogMagicA = 0xf0;
-  uint8_t TernlogMagicB = 0xcc;
-  uint8_t TernlogMagicC = 0xaa;
-
-  // Some of the inputs may be inverted, peek through them and invert the
-  // magic values accordingly.
-  // TODO: There may be a bitcast before the xor that we should peek through.
-  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
-    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
-        ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
-      Magic = ~Magic;
-      Parent = Op.getNode();
-      Op = Op.getOperand(0);
+    if (InLeaves.size() > 1) {
+      B = InLeaves[1].Leaf;
+      ParentB = InLeaves[1].Parent;
     }
+    if (InLeaves.size() > 2) {
+      C = InLeaves[2].Leaf;
+      ParentC = InLeaves[2].Parent;
+    }
+
+    return matchVPTERNLOG(Root, ParentA, ParentB, ParentC, A, B, C, Imm);
   };
 
-  PeekThroughNot(A, ParentA, TernlogMagicA);
-  PeekThroughNot(B, ParentB, TernlogMagicB);
-  PeekThroughNot(C, ParentC, TernlogMagicC);
-
-  uint8_t Imm;
-  switch (FoldableOp.getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode!");
-  case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
-  case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
-  case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
-  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+  SmallVector<LeafInfo, 3> Leaves;
+  uint8_t Imm = 0;
+  bool TooManyLeaves = false;
+  if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
+                     TooManyLeaves)) {
+    if (Leaves.size() < 2)
+      return false;
+    return EmitFromLeaves(N, Leaves, Imm);
   }
 
-  switch (N->getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode!");
-  case X86ISD::ANDNP:
-    if (A == N0)
-      Imm &= ~TernlogMagicA;
-    else
-      Imm = ~(Imm) & TernlogMagicA;
-    break;
-  case ISD::AND: Imm &= TernlogMagicA; break;
-  case ISD::OR:  Imm |= TernlogMagicA; break;
-  case ISD::XOR: Imm ^= TernlogMagicA; break;
-  }
+  // Generic cascading for >3 leaves: keep one direct root operand as an opaque
+  // input leaf, then fold the remaining logic around it. This allows
+  // multi-level trees to be selected as chained VPTERNLOG operations.
+  if (TooManyLeaves) {
+    auto IsGoodOpaqueCandidate = [&](SDValue V) {
+      SDValue P = PeelSingleUseBitcast(V);
+      if (IsAllOnesXor(P))
+        P = PeelSingleUseBitcast(P.getOperand(0));
+      return IsLogicOpcode(P.getOpcode());
+    };
+
+    SmallVector<unsigned, 2> CandidateOrder;
+    if (IsGoodOpaqueCandidate(N->getOperand(0)))
+      CandidateOrder.push_back(0);
+    if (IsGoodOpaqueCandidate(N->getOperand(1)))
+      CandidateOrder.push_back(1);
+    if (CandidateOrder.empty())
+      return false;
+
+    // Prefer single-use subtrees first; they are better cascading anchors.
+    if (CandidateOrder.size() == 2 &&
+        !N->getOperand(CandidateOrder[0]).hasOneUse() &&
+        N->getOperand(CandidateOrder[1]).hasOneUse())
+      std::swap(CandidateOrder[0], CandidateOrder[1]);
 
-  if (PeeledOuterNot)
-    Imm = ~Imm;
+    for (unsigned Idx : CandidateOrder) {
+      SmallVector<LeafInfo, 3> CascadedLeaves;
+      uint8_t CascadedImm = 0;
+      bool CascadedTooManyLeaves = false;
+      SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
 
-  return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
+      if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+                          CascadedImm, CascadedTooManyLeaves))
+        continue;
+
+      if (CascadedLeaves.size() < 2)
+        continue;
+
+      if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+        return true;
+    }
+  }
+
+  return false;
 }
 
 /// If the high bits of an 'and' operand are known zero, try setting the
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index bd7478d3a82d5..100d31883823d 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -4,7 +4,7 @@
 define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT:    vpternlogq $1
 ; CHECK-NEXT:    retq
   %and.demorgan = or <8 x i64> %b, %a
   %and3.demorgan = or <8 x i64> %and.demorgan, %c
@@ -15,7 +15,7 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
 define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
 ; CHECK-LABEL: xorbitcast:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT:    vpternlogq $1
 ; CHECK-NEXT:    retq
   %or1 = or <64 x i8> %a, %b
   %or2 = or <64 x i8> %or1, %c
@@ -23,3 +23,77 @@ define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
   %xor = xor <8 x i64> %cast, splat (i64 -1)
   ret <8 x i64> %xor
 }
+
+define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                         <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: foobar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $208
+; CHECK-NEXT:    vpternlogq $252
+; CHECK-NEXT:    vpandnq
+; CHECK-NEXT:    retq
+  %nb = xor <8 x i64> %b, splat (i64 -1)
+  %or = or <8 x i64> %nb, %c
+  %foo = and <8 x i64> %or, %a
+  %de = or <8 x i64> %d, %e
+  %nde = xor <8 x i64> %de, splat (i64 -1)
+  %bar = and <8 x i64> %foo, %nde
+  ret <8 x i64> %bar
+}
+
+define <8 x i64> @or_not_and_guard(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                   <8 x i64> %d) {
+; CHECK-LABEL: or_not_and_guard:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $16
+; CHECK-NOT:   vpternlogq $252
+; CHECK:       retq
+  %or = or <8 x i64> %b, %c
+  %not_or = xor <8 x i64> %or, splat (i64 -1)
+  %lhs = and <8 x i64> %a, %not_or
+  %res = and <8 x i64> %lhs, %d
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @depth4_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                               <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: depth4_chain:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %t0 = xor <8 x i64> %a, %b
+  %t1 = or <8 x i64> %t0, %c
+  %t2 = and <8 x i64> %t1, %d
+  %t3 = xor <8 x i64> %t2, %e
+  ret <8 x i64> %t3
+}
+
+define <8 x i64> @depth5_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                               <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: depth5_chain:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %t0 = and <8 x i64> %a, %b
+  %t1 = xor <8 x i64> %t0, %c
+  %t2 = or <8 x i64> %t1, %d
+  %t3 = and <8 x i64> %t2, %e
+  %t4 = xor <8 x i64> %t3, %f
+  ret <8 x i64> %t4
+}
+
+define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                 <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: balanced_depth:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK:       vpternlogq
+; CHECK:       vpternlogq
+; CHECK:       retq
+  %l0 = or <8 x i64> %a, %b
+  %l1 = xor <8 x i64> %c, %d
+  %l2 = and <8 x i64> %l0, %l1
+  %r0 = xor <8 x i64> %e, %f
+  %res = or <8 x i64> %l2, %r0
+  ret <8 x i64> %res
+}
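
A quick worked example before the second patch (my own sketch, not code from the patch): the OR-reduction tests added later expect repeated vpternlogq $254 because a 3-input OR has immediate 0xF0 | 0xCC | 0xAA = 0xFE, and each intermediate result re-enters the cascade as a fresh leaf:

    #include <cstdint>
    #include <cassert>

    int main() {
      const uint8_t MagicA = 0xF0, MagicB = 0xCC, MagicC = 0xAA;
      // Truth table of a 3-input OR: 0xFE == 254.
      assert(((MagicA | MagicB | MagicC) & 0xFF) == 0xFE);
      // A 9-way OR reduction therefore lowers to four imm=254 VPTERNLOGs:
      //   t0 = ternlog(v0, v1, v2, 254)  // consumes 3 leaves
      //   t1 = ternlog(t0, v3, v4, 254)  // each later step consumes 2 more
      //   t2 = ternlog(t1, v5, v6, 254)
      //   t3 = ternlog(t2, v7, v8, 254)
      return 0;
    }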

From f6bd361112580284ec8063d9916e649e20e5bcb8 Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Wed, 1 Apr 2026 22:46:46 +0800
Subject: [PATCH 2/2] [AVX-512] make vpternlogq more aggressive for longer
 chains of bit manipulations

---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 133 +++++++++++++
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp    | 217 ++++++++++++++++++++-
 llvm/test/CodeGen/X86/vpternlog.ll         | 123 +++++++++++-
 3 files changed, 467 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index af3c3af38e681..c766da744a581 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -21,6 +21,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86RegisterInfo.h"
@@ -37,8 +38,135 @@ using namespace llvm;
 #define DEBUG_TYPE "x86-fixup-inst-tuning"
 
 STATISTIC(NumInstChanges, "Number of instructions changes");
+STATISTIC(NumVPTERNLOGNotFolds,
+          "Number of all-ones + XOR-mem pairs fused into VPTERNLOG NOT");
 
 namespace {
+
+/// Return the VPTERNLOG-rmi opcode for the given XOR-mem opcode, or 0 if there
+/// is no corresponding opcode.  We use the Q (64-bit element) variant for the
+/// VPTERNLOG so that the memory operand can be folded with the larger element
+/// granularity – element type is irrelevant for a bitwise NOT.
+static unsigned getVPTERNLOGForXORrm(unsigned XorOpc) {
+  switch (XorOpc) {
+  default:
+    return 0;
+  case X86::VPXORQZrm:
+  case X86::VPXORDZrm:
+    return X86::VPTERNLOGQZrmi;
+  case X86::VPXORQZ256rm:
+  case X86::VPXORDZ256rm:
+    return X86::VPTERNLOGQZ256rmi;
+  case X86::VPXORQZ128rm:
+  case X86::VPXORDZ128rm:
+    return X86::VPTERNLOGQZ128rmi;
+  }
+}
+
+/// Return true if \p MI is a VPTERNLOG-rri that materializes all-ones
+/// (immediate 0xFF) with all source operands marked as undef.
+static bool isVPTERNLOGAllOnes(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case X86::VPTERNLOGDZrri:
+  case X86::VPTERNLOGQZrri:
+  case X86::VPTERNLOGDZ256rri:
+  case X86::VPTERNLOGQZ256rri:
+  case X86::VPTERNLOGDZ128rri:
+  case X86::VPTERNLOGQZ128rri:
+    break;
+  }
+  // The last operand is the immediate; it must be 0xFF (all-ones).
+  const MachineOperand &ImmOp = MI.getOperand(MI.getNumOperands() - 1);
+  return ImmOp.isImm() && (ImmOp.getImm() & 0xFF) == 0xFF;
+}
+
+/// Try to fuse an all-ones materialization followed by a vector XOR-from-memory
+/// into a single VPTERNLOG NOT-from-memory:
+///
+///   $dst = VPTERNLOGDZrri undef $dst, undef $dst, undef $dst, 255
+///   $dst = VPXORQZrm      killed $dst, <mem>
+///
+/// becomes:
+///
+///   $dst = VPTERNLOGQZrmi  undef $dst, undef $dst, <mem>, 0x55
+///
+/// The immediate 0x55 = ~C (where C = src3 = memory operand), which is
+/// independent of src1 and src2.
+static bool tryFuseNotFromMem(const X86InstrInfo *TII, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &I) {
+  MachineInstr &XorMI = *I;
+  unsigned TernlogOpc = getVPTERNLOGForXORrm(XorMI.getOpcode());
+  if (!TernlogOpc)
+    return false;
+
+  // The XOR-rm layout: $dst(tied=$src1), $src1, base, scale, index, disp, seg.
+  Register DstReg = XorMI.getOperand(0).getReg();
+
+  // Walk backward to find the all-ones materialization.  Skip debug
+  // instructions, but stop at the first real instruction: the fold only
+  // applies when the all-ones def of DstReg is immediately adjacent.
+  MachineBasicBlock::iterator PrevIt = I;
+  if (PrevIt == MBB.begin())
+    return false;
+
+  MachineInstr *AllOnesMI = nullptr;
+  for (--PrevIt;; --PrevIt) {
+    MachineInstr &Prev = *PrevIt;
+
+    if (Prev.isDebugInstr()) {
+      if (PrevIt == MBB.begin())
+        return false;
+      continue;
+    }
+
+    if (isVPTERNLOGAllOnes(Prev) && Prev.getOperand(0).getReg() == DstReg) {
+      AllOnesMI = &Prev;
+      break;
+    }
+
+    // Any other instruction that reads or writes DstReg blocks the fold.
+    return false;
+  }
+
+  if (!AllOnesMI)
+    return false;
+
+  // Verify that the all-ones defines only DstReg and has no other users
+  // between itself and the XOR.  Since they are adjacent (modulo debug instrs)
+  // and both write DstReg, this is guaranteed.
+
+  LLVM_DEBUG(dbgs() << "Fusing VPTERNLOG NOT-from-memory:\n"
+                    << "  " << *AllOnesMI << "  " << XorMI);
+
+  // Build: $dst = VPTERNLOGQZrmi undef $dst, undef $dst, <mem>, 0x55
+  // The XOR-rm operands: 0=dst, 1=src1, 2..6=mem(base,scale,index,disp,seg)
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, I, XorMI.getDebugLoc(), TII->get(TernlogOpc), DstReg)
+          .addReg(DstReg, RegState::Undef)  // src1 (tied, don't care)
+          .addReg(DstReg, RegState::Undef); // src2 (don't care)
+
+  // Copy the 5 memory addressing operands from the XOR.
+  for (unsigned J = 2; J < 2 + X86::AddrNumOperands; ++J)
+    MIB.add(XorMI.getOperand(J));
+
+  MIB.addImm(0x55); // imm = ~C where C = src3 = memory operand
+
+  // Preserve mem-refs from the XOR.
+  MIB.setMemRefs(XorMI.memoperands());
+
+  LLVM_DEBUG(dbgs() << "  -> " << *MIB);
+
+  // Erase the two old instructions.
+  AllOnesMI->eraseFromParent();
+  I = MIB.getInstr()->getIterator();
+  XorMI.eraseFromParent();
+
+  ++NumVPTERNLOGNotFolds;
+  return true;
+}
+
 class X86FixupInstTuningImpl {
 public:
   bool runOnMachineFunction(MachineFunction &MF);
@@ -683,6 +811,11 @@ bool X86FixupInstTuningImpl::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+      // Try fusing all-ones + XOR-mem → VPTERNLOG NOT-mem first.
+      if (tryFuseNotFromMem(TII, MBB, I)) {
+        Changed = true;
+        continue;
+      }
       if (processInstruction(MF, MBB, I)) {
         ++NumInstChanges;
         Changed = true;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1dc6c10aba4a6..8aa39032b7d43 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4835,6 +4835,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
            Opc == X86ISD::ANDNP;
   };
 
+  auto IsAllOnesVec = [](SDValue V) {
+    return ISD::isBuildVectorAllOnes(V.getNode());
+  };
+
   auto IsAllOnesXor = [](SDValue V) {
     return V.getOpcode() == ISD::XOR &&
            ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
@@ -4846,6 +4850,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
     return V;
   };
 
+  std::function<bool(SDValue, unsigned)> IsHomogeneousAssociativeTree =
+      [&](SDValue V, unsigned Opc) {
+        V = PeelSingleUseBitcast(V);
+        if (V.getOpcode() != Opc)
+          return true;
+        if (!V.hasOneUse())
+          return false;
+        return IsHomogeneousAssociativeTree(V.getOperand(0), Opc) &&
+               IsHomogeneousAssociativeTree(V.getOperand(1), Opc);
+      };
+
+  auto IsLoadLike = [](SDValue V) {
+    return isa<LoadSDNode>(V.getNode()) ||
+           V.getOpcode() == X86ISD::VBROADCAST_LOAD;
+  };
+
+  // Fast-path: X ^ -1 -> ~X.
+  //
+  // Use X for all three VPTERNLOG inputs and select an immediate that yields
+  // ~X when A == B == C == X (imm bit0 = 1, bit7 = 0; other bits are don't
+  // care). This avoids introducing undef register operands.
+  //
+  // Keep this for register-like X only. For load-like X, this can cause an
+  // extra move/load before a folded-load VPTERNLOG form, which is usually not
+  // profitable.
+  if (N->getOpcode() == ISD::XOR) {
+    for (unsigned Idx = 0; Idx != 2; ++Idx) {
+      if (!IsAllOnesVec(N->getOperand(Idx)))
+        continue;
+
+      SDValue X = N->getOperand(Idx ^ 1);
+      SDValue XNoCast = PeelSingleUseBitcast(X);
+      if (IsLogicOpcode(XNoCast.getOpcode()) || IsAllOnesXor(XNoCast) ||
+          IsLoadLike(XNoCast))
+        continue;
+
+      if (matchVPTERNLOG(N, N, N, N, X, X, X, 0x01))
+        return true;
+    }
+  }
+
   // Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
   // higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
   // the whole pattern instead.
@@ -4866,6 +4911,26 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
     }
   }
 
+  // Avoid consuming xor(logic_op, -1) (i.e. NOT of a logic sub-tree) into a
+  // stand-alone VPTERNLOG when the parent is also a logic op.  The parent's
+  // ComputeTernlog will fold the NOT directly into the truth-table immediate,
+  // producing fewer instructions overall.
+  if (N->getOpcode() == ISD::XOR && N->hasOneUse() &&
+      (ISD::isBuildVectorAllOnes(N->getOperand(0).getNode()) ||
+       ISD::isBuildVectorAllOnes(N->getOperand(1).getNode()))) {
+    SDValue Inner = ISD::isBuildVectorAllOnes(N->getOperand(1).getNode())
+                        ? N->getOperand(0)
+                        : N->getOperand(1);
+    SDValue InnerNoCast = PeelSingleUseBitcast(Inner);
+    if (IsLogicOpcode(InnerNoCast.getOpcode())) {
+      SDNode *User = *N->user_begin();
+      while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+        User = *User->user_begin();
+      if (IsLogicOpcode(User->getOpcode()))
+        return false;
+    }
+  }
+
   // Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
   if (N->getOpcode() == ISD::AND) {
     for (unsigned Idx = 0; Idx != 2; ++Idx) {
@@ -4989,7 +5054,13 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   bool TooManyLeaves = false;
   if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
                      TooManyLeaves)) {
-    if (Leaves.size() < 2)
+    if (Leaves.empty())
+      return false;
+    // A single-leaf load folded into VPTERNLOG causes a redundant explicit
+    // load (for the tied src1=dst) plus a folded load, doubling memory
+    // traffic. Bail out and let default lowering handle it (e.g.
+    // SETALLONES + VPXORQ mem for NOT-of-load).
+    if (Leaves.size() == 1 && IsLoadLike(PeelSingleUseBitcast(Leaves[0].Leaf)))
       return false;
     return EmitFromLeaves(N, Leaves, Imm);
   }
@@ -4998,6 +5069,45 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
   // input leaf, then fold the remaining logic around it. This allows
   // multi-level trees to be selected as chained VPTERNLOG operations.
   if (TooManyLeaves) {
+    bool IsAssocOp = N->getOpcode() == ISD::OR || N->getOpcode() == ISD::AND ||
+                     N->getOpcode() == ISD::XOR;
+    bool IsHomogeneousAssoc = IsAssocOp && IsHomogeneousAssociativeTree(
+                                               SDValue(N, 0), N->getOpcode());
+
+    // For homogeneous associative trees, prefer choosing an opaque subtree
+    // from one level below the root-side logic node so we can expose two fresh
+    // siblings and form 3-input VPTERNLOG combines (e.g. OR reductions as
+    // repeated imm=254), rather than creating long 2-input $252/$250 chains.
+    if (IsHomogeneousAssoc) {
+      for (unsigned RootIdx = 0; RootIdx != 2; ++RootIdx) {
+        SDValue Side = PeelSingleUseBitcast(N->getOperand(RootIdx));
+        if (!Side.hasOneUse() || Side.getOpcode() != N->getOpcode())
+          continue;
+
+        for (unsigned ChildIdx = 0; ChildIdx != 2; ++ChildIdx) {
+          SDValue Child = PeelSingleUseBitcast(Side.getOperand(ChildIdx));
+          if (!Child.hasOneUse() || Child.getOpcode() != N->getOpcode())
+            continue;
+
+          SmallVector<LeafInfo, 3> AssocLeaves;
+          uint8_t AssocImm = 0;
+          bool AssocTooManyLeaves = false;
+          if (!ComputeTernlog(SDValue(N, 0), Child.getNode(), AssocLeaves,
+                              AssocImm, AssocTooManyLeaves))
+            continue;
+
+          if (AssocLeaves.size() < 2)
+            continue;
+
+          if (EmitFromLeaves(N, AssocLeaves, AssocImm))
+            return true;
+        }
+      }
+      // Fall through to generic cascading — for balanced trees all
+      // grandchildren may be leaves, so the child-as-opaque strategy below
+      // can still produce a valid 3-input combine.
+    }
+
     auto IsGoodOpaqueCandidate = [&](SDValue V) {
       SDValue P = PeelSingleUseBitcast(V);
       if (IsAllOnesXor(P))
@@ -5019,6 +5129,25 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
         N->getOperand(CandidateOrder[1]).hasOneUse())
       std::swap(CandidateOrder[0], CandidateOrder[1]);
 
+    // Collect all opaque subtree candidates: direct children and, if a direct
+    // child is itself a logic op, its grandchildren.  Trying grandchildren
+    // allows the root VPTERNLOG to absorb more distinct operands (3 instead
+    // of 2), which produces tighter cascades.  Example:
+    //
+    //      xor              With child "and" opaque: 2 leaves (and, e)
+    //     /   \             With grandchild "or" opaque: 3 leaves (or, d, e)
+    //   and    e            → saves one instruction in the cascade.
+    //  /   \
+    // or    d
+    //
+    // Try direct children first.  Only explore grandchildren when all direct
+    // children produce ≤2 leaves (i.e. a degenerate 2-input fold that wastes
+    // a VPTERNLOG slot) AND the opaque subtree itself has >3 leaves, meaning
+    // a single VPTERNLOG cannot handle it. When the opaque child fits in one
+    // VPTERNLOG (≤3 leaves), going deeper just reshuffles the split without
+    // saving instructions.
+    bool TriedDirect = false;
+    bool NeedsGrandchild = false;
     for (unsigned Idx : CandidateOrder) {
       SmallVector<LeafInfo, 3> CascadedLeaves;
       uint8_t CascadedImm = 0;
@@ -5029,11 +5158,91 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
                           CascadedImm, CascadedTooManyLeaves))
         continue;
 
-      if (CascadedLeaves.size() < 2)
+      if (CascadedLeaves.empty())
         continue;
 
-      if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
-        return true;
+      // If the direct child yields a 3-leaf fold, emit it right away — this
+      // is already optimal for this level.
+      if (CascadedLeaves.size() == 3) {
+        if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+          return true;
+      }
+      TriedDirect = true;
+
+      // Check if the opaque subtree itself has >3 leaves: if so, it cannot
+      // be handled by a single VPTERNLOG, and going one level deeper may
+      // help reduce the total instruction count.
+      if (CascadedLeaves.size() <= 2) {
+        SmallVector<LeafInfo, 3> SubLeaves;
+        uint8_t SubImm = 0;
+        bool SubTooMany = false;
+        if (!ComputeTernlog(N->getOperand(Idx),
+                            /*OpaqueSubtree=*/nullptr, SubLeaves, SubImm,
+                            SubTooMany) &&
+            SubTooMany)
+          NeedsGrandchild = true;
+      }
+    }
+
+    // Direct children only yielded ≤2-leaf folds and the opaque subtree has
+    // >3 leaves (can't fit in one VPTERNLOG).  Try grandchildren — making a
+    // deeper subtree opaque exposes more leaves at the root level, reducing
+    // the total instruction count.
+    if (NeedsGrandchild) {
+      for (unsigned Idx : CandidateOrder) {
+        SDValue Child = N->getOperand(Idx);
+        SDValue ChildNoCast = PeelSingleUseBitcast(Child);
+        if (!ChildNoCast.hasOneUse() || !IsLogicOpcode(ChildNoCast.getOpcode()))
+          continue;
+
+        for (unsigned GIdx = 0; GIdx != 2; ++GIdx) {
+          SDValue GChild = ChildNoCast.getOperand(GIdx);
+          SDValue GChildNoCast = PeelSingleUseBitcast(GChild);
+
+          // Skip NOT-wrappers (xor X, -1): ComputeTernlog already folds NOT
+          // into the truth table, so cutting at a NOT boundary just pushes
+          // the NOT into a separate instruction without saving anything.
+          if (IsAllOnesXor(GChildNoCast))
+            continue;
+
+          if (!IsLogicOpcode(GChildNoCast.getOpcode()))
+            continue;
+
+          SmallVector<LeafInfo, 3> CascadedLeaves;
+          uint8_t CascadedImm = 0;
+          bool CascadedTooManyLeaves = false;
+
+          if (!ComputeTernlog(SDValue(N, 0), GChild.getNode(), CascadedLeaves,
+                              CascadedImm, CascadedTooManyLeaves))
+            continue;
+
+          if (CascadedLeaves.size() < 3)
+            continue;
+
+          if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+            return true;
+        }
+      }
+    }
+
+    // Fall back to direct-child opaque with ≤2 leaves if nothing else worked.
+    if (TriedDirect) {
+      for (unsigned Idx : CandidateOrder) {
+        SmallVector<LeafInfo, 3> CascadedLeaves;
+        uint8_t CascadedImm = 0;
+        bool CascadedTooManyLeaves = false;
+        SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
+
+        if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+                            CascadedImm, CascadedTooManyLeaves))
+          continue;
+
+        if (CascadedLeaves.empty())
+          continue;
+
+        if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+          return true;
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index 100d31883823d..4f9e384bd85a0 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -29,8 +29,7 @@ define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
 ; CHECK-LABEL: foobar:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpternlogq $208
-; CHECK-NEXT:    vpternlogq $252
-; CHECK-NEXT:    vpandnq
+; CHECK-NEXT:    vpternlogq $16
 ; CHECK-NEXT:    retq
   %nb = xor <8 x i64> %b, splat (i64 -1)
   %or = or <8 x i64> %nb, %c
@@ -97,3 +96,123 @@ define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
   %res = or <8 x i64> %l2, %r0
   ret <8 x i64> %res
 }
+
+define <8 x i64> @flip(ptr %x) {
+; CHECK-LABEL: flip:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $85, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %ld = load <8 x i64>, ptr %x, align 64
+  %not = xor <8 x i64> %ld, splat (i64 -1)
+  ret <8 x i64> %not
+}
+
+define dso_local <8 x i64> @fubar(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+                                  <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+                                  <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+    local_unnamed_addr {
+; CHECK-LABEL: fubar:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK-NOT:   vpternlogq $252
+; CHECK-NOT:   vpternlogq $250
+; CHECK:       retq
+Entry:
+  %9 = or <8 x i64> %1, %0
+  %10 = or <8 x i64> %9, %2
+  %11 = or <8 x i64> %10, %3
+  %12 = or <8 x i64> %11, %4
+  %13 = or <8 x i64> %12, %5
+  %14 = or <8 x i64> %13, %6
+  %15 = or <8 x i64> %14, %7
+  %16 = or <8 x i64> %15, %8
+  ret <8 x i64> %16
+}
+
+define dso_local <8 x i64> @baz(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+                                <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+                                <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+    local_unnamed_addr {
+; CHECK-LABEL: baz:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       vpternlogq $254
+; CHECK:       retq
+Entry:
+  %9 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %0,
+                                                            <8 x i64> %1,
+                                                            <8 x i64> %2,
+                                                            i32 254)
+  %10 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %3,
+                                                             <8 x i64> %4,
+                                                             <8 x i64> %5,
+                                                             i32 254)
+  %11 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %6,
+                                                             <8 x i64> %7,
+                                                             <8 x i64> %8,
+                                                             i32 254)
+  %12 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %9,
+                                                             <8 x i64> %10,
+                                                             <8 x i64> %11,
+                                                             i32 254)
+  ret <8 x i64> %12
+}
+
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>,
+                                                   <8 x i64>, i32 immarg)
+
+; 256-bit vector case — exercises VLX path.
+define <4 x i64> @foo_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-LABEL: foo_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $1
+; CHECK-NEXT:    retq
+  %or1 = or <4 x i64> %b, %a
+  %or2 = or <4 x i64> %or1, %c
+  %not = xor <4 x i64> %or2, splat (i64 -1)
+  ret <4 x i64> %not
+}
+
+; 128-bit vector case — exercises VLX path.
+define <2 x i64> @foo_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: foo_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpternlogq $1
+; CHECK-NEXT:    retq
+  %or1 = or <2 x i64> %b, %a
+  %or2 = or <2 x i64> %or1, %c
+  %not = xor <2 x i64> %or2, splat (i64 -1)
+  ret <2 x i64> %not
+}
+
+; Balanced OR tree — tests fallthrough from homogeneous-associative to generic.
+define <8 x i64> @balanced_or4(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+                                <8 x i64> %d) {
+; CHECK-LABEL: balanced_or4:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq $254
+; CHECK-NOT:   vporq
+; CHECK:       retq
+  %t1 = or <8 x i64> %a, %b
+  %t2 = or <8 x i64> %c, %d
+  %res = or <8 x i64> %t1, %t2
+  ret <8 x i64> %res
+}
+
+; Multi-use operand — %a used in both ORs.  hasOneUse() on the shared
+; value must not block the combine.
+define <8 x i64> @shared_operand(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: shared_operand:
+; CHECK:       # %bb.0:
+; CHECK:       vpternlogq
+; CHECK-NEXT:    retq
+  %or1 = or <8 x i64> %a, %b
+  %or2 = or <8 x i64> %a, %c
+  %res = and <8 x i64> %or1, %or2
+  ret <8 x i64> %res
+}
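
One last sanity check on the X86FixupInstTuning fold above (again an illustrative sketch, not code from the patch): the 0x55 immediate for NOT-of-memory falls out of the same truth-table arithmetic. src3, the folded memory operand, carries magic 0xAA, and the result is independent of src1/src2, which is why those can stay undef:

    #include <cstdint>
    #include <cassert>

    int main() {
      const uint8_t MagicC = 0xAA; // truth table of src3, the memory operand
      // ~src3 is 0x55 == 85, matching the "vpternlogq $85" the flip test
      // checks for; src1/src2 do not appear in the expression at all.
      assert(static_cast<uint8_t>(~MagicC) == 0x55);
      return 0;
    }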


