[llvm] [AVX-512] make vpternlogq more aggressive for longer chains of bitmanipulations (PR #189971)
Julian Pokrovsky via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 08:09:25 PDT 2026
https://github.com/raventid updated https://github.com/llvm/llvm-project/pull/189971
>From 4ec75e67835f852719a4a33064257097faec15ce Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Tue, 24 Mar 2026 14:37:20 +0800
Subject: [PATCH 1/2] [vpternlog] optimize more aggressively
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 283 ++++++++++++++++--------
llvm/test/CodeGen/X86/vpternlog.ll | 78 ++++++-
2 files changed, 266 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index be95168f2de00..1dc6c10aba4a6 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -32,6 +32,8 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
+#include <functional>
+#include <optional>
using namespace llvm;
@@ -4815,8 +4817,7 @@ bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
return true;
}
-// Try to match two logic ops to a VPTERNLOG.
-// FIXME: Handle more complex patterns that use an operand more than once?
+// Try to match logic trees to one or more VPTERNLOG operations.
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
MVT NVT = N->getSimpleValueType(0);
@@ -4829,118 +4830,214 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
return false;
- auto getFoldableLogicOp = [](SDValue Op) {
- // Peek through single use bitcast.
- if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
- Op = Op.getOperand(0);
+ auto IsLogicOpcode = [](unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP;
+ };
+
+ auto IsAllOnesXor = [](SDValue V) {
+ return V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
+ };
+
+ auto PeelSingleUseBitcast = [](SDValue V) {
+ if (V.getOpcode() == ISD::BITCAST && V.hasOneUse())
+ return V.getOperand(0);
+ return V;
+ };
+
+ // Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
+ // higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
+ // the whole pattern instead.
+ if (N->getOpcode() == ISD::OR && N->hasOneUse()) {
+ SDNode *User = *N->user_begin();
+ while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+ User = *User->user_begin();
+
+ if (User->getOpcode() == ISD::XOR && User->hasOneUse() &&
+ (ISD::isBuildVectorAllOnes(User->getOperand(0).getNode()) ||
+ ISD::isBuildVectorAllOnes(User->getOperand(1).getNode()))) {
+ SDNode *NextUser = *User->user_begin();
+ while (NextUser->getOpcode() == ISD::BITCAST && NextUser->hasOneUse())
+ NextUser = *NextUser->user_begin();
+ unsigned NextOpc = NextUser->getOpcode();
+ if (NextOpc == ISD::AND || NextOpc == X86ISD::ANDNP)
+ return false;
+ }
+ }
- if (!Op.hasOneUse())
- return SDValue();
+ // Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
+ if (N->getOpcode() == ISD::AND) {
+ for (unsigned Idx = 0; Idx != 2; ++Idx) {
+ SDValue NotSide = N->getOperand(Idx);
+ SDValue A = N->getOperand(Idx ^ 1);
- unsigned Opc = Op.getOpcode();
- if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
- Opc == X86ISD::ANDNP)
- return Op;
+ SDValue NotSideNoCast = PeelSingleUseBitcast(NotSide);
+ if (!NotSideNoCast.hasOneUse() || !IsAllOnesXor(NotSideNoCast))
+ continue;
- return SDValue();
+ SDValue Inner = PeelSingleUseBitcast(NotSideNoCast.getOperand(0));
+ if (!Inner.hasOneUse() || Inner.getOpcode() != ISD::OR)
+ continue;
+
+ SDValue B = Inner.getOperand(0);
+ SDValue C = Inner.getOperand(1);
+ if (matchVPTERNLOG(N, N, Inner.getNode(), Inner.getNode(), A, B, C, 0x10))
+ return true;
+ }
+ }
+
+ struct LeafInfo {
+ SDValue Leaf;
+ SDNode *Parent;
+ uint8_t Magic;
};
- SDValue N0, N1, A, FoldableOp;
+ auto ComputeTernlog = [&](SDValue Root, SDNode *OpaqueSubtree,
+ SmallVectorImpl<LeafInfo> &Leaves,
+ uint8_t &ImmOut, bool &TooManyLeaves) {
+ TooManyLeaves = false;
- // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
- auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
- if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
- ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
- SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
+ auto lookupLeaf = [&](SDValue Leaf) -> std::optional<uint8_t> {
+ for (const LeafInfo &L : Leaves)
+ if (L.Leaf == Leaf)
+ return L.Magic;
+ return std::nullopt;
+ };
- if (!InnerOp)
- return SDValue();
+ std::function<int(SDValue, SDNode *, bool)> ComputeRec =
+ [&](SDValue Op, SDNode *Parent, bool IsRoot) -> int {
+ if (Op.getNode() != OpaqueSubtree) {
+ // Peek through single-use bitcasts.
+ if (Op.getOpcode() == ISD::BITCAST && (IsRoot || Op.hasOneUse())) {
+ Parent = Op.getNode();
+ Op = Op.getOperand(0);
+ }
+
+ if ((IsRoot || Op.hasOneUse()) && IsAllOnesXor(Op)) {
+ int Inner = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+ return Inner < 0 ? -1 : ((~Inner) & 0xFF);
+ }
- N0 = InnerOp.getOperand(0);
- N1 = InnerOp.getOperand(1);
- if ((FoldableOp = getFoldableLogicOp(N1))) {
- A = N0;
- return InnerOp;
+ if ((IsRoot || Op.hasOneUse()) && IsLogicOpcode(Op.getOpcode())) {
+ int L = ComputeRec(Op.getOperand(0), Op.getNode(), false);
+ int R = ComputeRec(Op.getOperand(1), Op.getNode(), false);
+ if (L < 0 || R < 0)
+ return -1;
+
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case ISD::AND:
+ return (L & R) & 0xFF;
+ case ISD::OR:
+ return (L | R) & 0xFF;
+ case ISD::XOR:
+ return (L ^ R) & 0xFF;
+ case X86ISD::ANDNP:
+ return ((~L) & R) & 0xFF;
+ }
+ }
}
- if ((FoldableOp = getFoldableLogicOp(N0))) {
- A = N1;
- return InnerOp;
+
+ if (auto Existing = lookupLeaf(Op))
+ return *Existing;
+
+ if (Leaves.size() >= 3) {
+ TooManyLeaves = true;
+ return -1;
}
- }
- return SDValue();
- };
- bool PeeledOuterNot = false;
- SDNode *OriN = N;
- if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
- PeeledOuterNot = true;
- N = InnerOp.getNode();
- } else {
- N0 = N->getOperand(0);
- N1 = N->getOperand(1);
+ static constexpr uint8_t Magics[] = {0xF0, 0xCC, 0xAA};
+ uint8_t Magic = Magics[Leaves.size()];
+ Leaves.push_back({Op, Parent, Magic});
+ return Magic;
+ };
- if ((FoldableOp = getFoldableLogicOp(N1)))
- A = N0;
- else if ((FoldableOp = getFoldableLogicOp(N0)))
- A = N1;
- else
+ int Imm = ComputeRec(Root, Root.getNode(), true);
+ if (Imm < 0)
return false;
- }
+ ImmOut = static_cast<uint8_t>(Imm & 0xFF);
+ return true;
+ };
+
+ auto EmitFromLeaves = [&](SDNode *Root,
+ const SmallVectorImpl<LeafInfo> &InLeaves,
+ uint8_t Imm) {
+ assert(!InLeaves.empty() && "Expected at least one leaf");
+ SDValue A = InLeaves[0].Leaf;
+ SDNode *ParentA = InLeaves[0].Parent;
+ SDValue B = A;
+ SDNode *ParentB = ParentA;
+ SDValue C = A;
+ SDNode *ParentC = ParentA;
- SDValue B = FoldableOp.getOperand(0);
- SDValue C = FoldableOp.getOperand(1);
- SDNode *ParentA = N;
- SDNode *ParentB = FoldableOp.getNode();
- SDNode *ParentC = FoldableOp.getNode();
-
- // We can build the appropriate control immediate by performing the logic
- // operation we're matching using these constants for A, B, and C.
- uint8_t TernlogMagicA = 0xf0;
- uint8_t TernlogMagicB = 0xcc;
- uint8_t TernlogMagicC = 0xaa;
-
- // Some of the inputs may be inverted, peek through them and invert the
- // magic values accordingly.
- // TODO: There may be a bitcast before the xor that we should peek through.
- auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
- if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
- ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
- Magic = ~Magic;
- Parent = Op.getNode();
- Op = Op.getOperand(0);
+ if (InLeaves.size() > 1) {
+ B = InLeaves[1].Leaf;
+ ParentB = InLeaves[1].Parent;
}
+ if (InLeaves.size() > 2) {
+ C = InLeaves[2].Leaf;
+ ParentC = InLeaves[2].Parent;
+ }
+
+ return matchVPTERNLOG(Root, ParentA, ParentB, ParentC, A, B, C, Imm);
};
- PeekThroughNot(A, ParentA, TernlogMagicA);
- PeekThroughNot(B, ParentB, TernlogMagicB);
- PeekThroughNot(C, ParentC, TernlogMagicC);
-
- uint8_t Imm;
- switch (FoldableOp.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
- case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
- case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
- case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+ SmallVector<LeafInfo, 3> Leaves;
+ uint8_t Imm = 0;
+ bool TooManyLeaves = false;
+ if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
+ TooManyLeaves)) {
+ if (Leaves.size() < 2)
+ return false;
+ return EmitFromLeaves(N, Leaves, Imm);
}
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case X86ISD::ANDNP:
- if (A == N0)
- Imm &= ~TernlogMagicA;
- else
- Imm = ~(Imm) & TernlogMagicA;
- break;
- case ISD::AND: Imm &= TernlogMagicA; break;
- case ISD::OR: Imm |= TernlogMagicA; break;
- case ISD::XOR: Imm ^= TernlogMagicA; break;
- }
+ // Generic cascading for >3 leaves: keep one direct root operand as an opaque
+ // input leaf, then fold the remaining logic around it. This allows
+ // multi-level trees to be selected as chained VPTERNLOG operations.
+ if (TooManyLeaves) {
+ auto IsGoodOpaqueCandidate = [&](SDValue V) {
+ SDValue P = PeelSingleUseBitcast(V);
+ if (IsAllOnesXor(P))
+ P = PeelSingleUseBitcast(P.getOperand(0));
+ return IsLogicOpcode(P.getOpcode());
+ };
+
+ SmallVector<unsigned, 2> CandidateOrder;
+ if (IsGoodOpaqueCandidate(N->getOperand(0)))
+ CandidateOrder.push_back(0);
+ if (IsGoodOpaqueCandidate(N->getOperand(1)))
+ CandidateOrder.push_back(1);
+ if (CandidateOrder.empty())
+ return false;
+
+ // Prefer single-use subtrees first; they are better cascading anchors.
+ if (CandidateOrder.size() == 2 &&
+ !N->getOperand(CandidateOrder[0]).hasOneUse() &&
+ N->getOperand(CandidateOrder[1]).hasOneUse())
+ std::swap(CandidateOrder[0], CandidateOrder[1]);
- if (PeeledOuterNot)
- Imm = ~Imm;
+ for (unsigned Idx : CandidateOrder) {
+ SmallVector<LeafInfo, 3> CascadedLeaves;
+ uint8_t CascadedImm = 0;
+ bool CascadedTooManyLeaves = false;
+ SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
- return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
+ if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+ CascadedImm, CascadedTooManyLeaves))
+ continue;
+
+ if (CascadedLeaves.size() < 2)
+ continue;
+
+ if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+ return true;
+ }
+ }
+
+ return false;
}
/// If the high bits of an 'and' operand are known zero, try setting the
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index bd7478d3a82d5..100d31883823d 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -4,7 +4,7 @@
define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: vpternlogq $1
; CHECK-NEXT: retq
%and.demorgan = or <8 x i64> %b, %a
%and3.demorgan = or <8 x i64> %and.demorgan, %c
@@ -15,7 +15,7 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
; CHECK-LABEL: xorbitcast:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: vpternlogq $1
; CHECK-NEXT: retq
%or1 = or <64 x i8> %a, %b
%or2 = or <64 x i8> %or1, %c
@@ -23,3 +23,77 @@ define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
%xor = xor <8 x i64> %cast, splat (i64 -1)
ret <8 x i64> %xor
}
+
+define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: foobar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq $208
+; CHECK-NEXT: vpternlogq $252
+; CHECK-NEXT: vpandnq
+; CHECK-NEXT: retq
+ %nb = xor <8 x i64> %b, splat (i64 -1)
+ %or = or <8 x i64> %nb, %c
+ %foo = and <8 x i64> %or, %a
+ %de = or <8 x i64> %d, %e
+ %nde = xor <8 x i64> %de, splat (i64 -1)
+ %bar = and <8 x i64> %foo, %nde
+ ret <8 x i64> %bar
+}
+
+define <8 x i64> @or_not_and_guard(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d) {
+; CHECK-LABEL: or_not_and_guard:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq $16
+; CHECK-NOT: vpternlogq $252
+; CHECK: retq
+ %or = or <8 x i64> %b, %c
+ %not_or = xor <8 x i64> %or, splat (i64 -1)
+ %lhs = and <8 x i64> %a, %not_or
+ %res = and <8 x i64> %lhs, %d
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @depth4_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d, <8 x i64> %e) {
+; CHECK-LABEL: depth4_chain:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq
+; CHECK: retq
+ %t0 = xor <8 x i64> %a, %b
+ %t1 = or <8 x i64> %t0, %c
+ %t2 = and <8 x i64> %t1, %d
+ %t3 = xor <8 x i64> %t2, %e
+ ret <8 x i64> %t3
+}
+
+define <8 x i64> @depth5_chain(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: depth5_chain:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq
+; CHECK: retq
+ %t0 = and <8 x i64> %a, %b
+ %t1 = xor <8 x i64> %t0, %c
+ %t2 = or <8 x i64> %t1, %d
+ %t3 = and <8 x i64> %t2, %e
+ %t4 = xor <8 x i64> %t3, %f
+ ret <8 x i64> %t4
+}
+
+define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d, <8 x i64> %e, <8 x i64> %f) {
+; CHECK-LABEL: balanced_depth:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq
+; CHECK: vpternlogq
+; CHECK: vpternlogq
+; CHECK: retq
+ %l0 = or <8 x i64> %a, %b
+ %l1 = xor <8 x i64> %c, %d
+ %l2 = and <8 x i64> %l0, %l1
+ %r0 = xor <8 x i64> %e, %f
+ %res = or <8 x i64> %l2, %r0
+ ret <8 x i64> %res
+}
>From ecc91eba380937db5e8d1ae26913155f441ec792 Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Wed, 1 Apr 2026 22:46:46 +0800
Subject: [PATCH 2/2] [AVX-512] make vpternlogq more aggressive for longer
chains of bitmanipulations
---
llvm/lib/Target/X86/X86FixupInstTuning.cpp | 133 +++++++++++++
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 221 ++++++++++++++++++++-
llvm/test/CodeGen/X86/vpternlog.ll | 123 +++++++++++-
3 files changed, 469 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index af3c3af38e681..c766da744a581 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -21,6 +21,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
@@ -37,8 +38,135 @@ using namespace llvm;
#define DEBUG_TYPE "x86-fixup-inst-tuning"
STATISTIC(NumInstChanges, "Number of instructions changes");
+STATISTIC(NumVPTERNLOGNotFolds,
+ "Number of all-ones + XOR-mem fused into VPTERNLOG NOT");
namespace {
+
+/// Return the VPTERNLOG-rmi opcode for the given XOR-mem opcode, or 0 if there
+/// is no corresponding opcode. We use the Q (64-bit element) variant for the
+/// VPTERNLOG so that the memory operand can be folded with the larger element
+/// granularity – element type is irrelevant for a bitwise NOT.
+static unsigned getVPTERNLOGForXORrm(unsigned XorOpc) {
+ switch (XorOpc) {
+ default:
+ return 0;
+ case X86::VPXORQZrm:
+ case X86::VPXORDZrm:
+ return X86::VPTERNLOGQZrmi;
+ case X86::VPXORQZ256rm:
+ case X86::VPXORDZ256rm:
+ return X86::VPTERNLOGQZ256rmi;
+ case X86::VPXORQZ128rm:
+ case X86::VPXORDZ128rm:
+ return X86::VPTERNLOGQZ128rmi;
+ }
+}
+
+/// Return true if \p MI is a VPTERNLOG-rri that materializes all-ones
+/// (immediate 0xFF) with all source operands marked as undef.
+static bool isVPTERNLOGAllOnes(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case X86::VPTERNLOGDZrri:
+ case X86::VPTERNLOGQZrri:
+ case X86::VPTERNLOGDZ256rri:
+ case X86::VPTERNLOGQZ256rri:
+ case X86::VPTERNLOGDZ128rri:
+ case X86::VPTERNLOGQZ128rri:
+ break;
+ }
+ // The last operand is the immediate; it must be 0xFF (all-ones).
+ const MachineOperand &ImmOp = MI.getOperand(MI.getNumOperands() - 1);
+ return ImmOp.isImm() && (ImmOp.getImm() & 0xFF) == 0xFF;
+}
+
+/// Try to fuse an all-ones materialization followed by a vector XOR-from-memory
+/// into a single VPTERNLOG NOT-from-memory:
+///
+/// $dst = VPTERNLOGDZrri undef $dst, undef $dst, undef $dst, 255
+/// $dst = VPXORQZrm killed $dst, <mem>
+///
+/// becomes:
+///
+/// $dst = VPTERNLOGQZrmi undef $dst, undef $dst, <mem>, 0x55
+///
+/// The immediate 0x55 = ~C (where C = src3 = memory operand), which is
+/// independent of src1 and src2.
+static bool tryFuseNotFromMem(const X86InstrInfo *TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &I) {
+ MachineInstr &XorMI = *I;
+ unsigned TernlogOpc = getVPTERNLOGForXORrm(XorMI.getOpcode());
+ if (!TernlogOpc)
+ return false;
+
+ // The XOR-rm layout: $dst(tied=$src1), $src1, base, scale, index, disp, seg.
+ Register DstReg = XorMI.getOperand(0).getReg();
+
+ // Walk backward to find the all-ones materialization. Skip debug and
+ // position-independent instructions, but stop at any real instruction that
+ // touches DstReg.
+ MachineBasicBlock::iterator PrevIt = I;
+ if (PrevIt == MBB.begin())
+ return false;
+
+ MachineInstr *AllOnesMI = nullptr;
+ for (--PrevIt;; --PrevIt) {
+ MachineInstr &Prev = *PrevIt;
+
+ if (Prev.isDebugInstr()) {
+ if (PrevIt == MBB.begin())
+ return false;
+ continue;
+ }
+
+ if (isVPTERNLOGAllOnes(Prev) && Prev.getOperand(0).getReg() == DstReg) {
+ AllOnesMI = &Prev;
+ break;
+ }
+
+ // Any other real (non-debug) instruction blocks the fold: we only fuse
+ // when the all-ones materialization immediately precedes the XOR.
+ return false;
+ }
+
+ if (!AllOnesMI)
+ return false;
+
+ // Verify that the all-ones defines only DstReg and has no other users
+ // between itself and the XOR. Since they are adjacent (modulo debug instrs)
+ // and both write DstReg, this is guaranteed.
+
+ LLVM_DEBUG(dbgs() << "Fusing VPTERNLOG NOT-from-memory:\n"
+ << " " << *AllOnesMI << " " << XorMI);
+
+ // Build: $dst = VPTERNLOGQZrmi undef $dst, undef $dst, <mem>, 0x55
+ // The XOR-rm operands: 0=dst, 1=src1, 2..6=mem(base,scale,index,disp,seg)
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, I, XorMI.getDebugLoc(), TII->get(TernlogOpc), DstReg)
+ .addReg(DstReg, RegState::Undef) // src1 (tied, don't care)
+ .addReg(DstReg, RegState::Undef); // src2 (don't care)
+
+ // Copy the 5 memory addressing operands from the XOR.
+ for (unsigned J = 2; J < 2 + X86::AddrNumOperands; ++J)
+ MIB.add(XorMI.getOperand(J));
+
+ MIB.addImm(0x55); // imm = ~C where C = src3 = memory operand
+
+ // Preserve mem-refs from the XOR.
+ MIB.setMemRefs(XorMI.memoperands());
+
+ LLVM_DEBUG(dbgs() << " -> " << *MIB);
+
+ // Erase the two old instructions.
+ AllOnesMI->eraseFromParent();
+ I = MIB.getInstr()->getIterator();
+ XorMI.eraseFromParent();
+
+ ++NumVPTERNLOGNotFolds;
+ return true;
+}
+
class X86FixupInstTuningImpl {
public:
bool runOnMachineFunction(MachineFunction &MF);
@@ -683,6 +811,11 @@ bool X86FixupInstTuningImpl::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ // Try fusing all-ones + XOR-mem → VPTERNLOG NOT-mem first.
+ if (tryFuseNotFromMem(TII, MBB, I)) {
+ Changed = true;
+ continue;
+ }
if (processInstruction(MF, MBB, I)) {
++NumInstChanges;
Changed = true;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1dc6c10aba4a6..17c7d7dfe92a5 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4835,6 +4835,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
Opc == X86ISD::ANDNP;
};
+ auto IsAllOnesVec = [](SDValue V) {
+ return ISD::isBuildVectorAllOnes(V.getNode());
+ };
+
auto IsAllOnesXor = [](SDValue V) {
return V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
@@ -4846,6 +4850,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
return V;
};
+ std::function<bool(SDValue, unsigned)> IsHomogeneousAssociativeTree =
+ [&](SDValue V, unsigned Opc) {
+ V = PeelSingleUseBitcast(V);
+ if (V.getOpcode() != Opc)
+ return true;
+ if (!V.hasOneUse())
+ return false;
+ return IsHomogeneousAssociativeTree(V.getOperand(0), Opc) &&
+ IsHomogeneousAssociativeTree(V.getOperand(1), Opc);
+ };
+
+ auto IsLoadLike = [](SDValue V) {
+ return isa<LoadSDNode>(V.getNode()) ||
+ V.getOpcode() == X86ISD::VBROADCAST_LOAD;
+ };
+
+ // Fast-path: X ^ -1 -> ~X.
+ //
+ // Use X for all three VPTERNLOG inputs and select an immediate that yields
+ // ~X when A == B == C == X (imm bit0 = 1, bit7 = 0; other bits are don't
+ // care). This avoids introducing undef register operands.
+ //
+ // Keep this for register-like X only. For load-like X, this can cause an
+ // extra move/load before a folded-load VPTERNLOG form, which is usually not
+ // profitable.
+ if (N->getOpcode() == ISD::XOR) {
+ for (unsigned Idx = 0; Idx != 2; ++Idx) {
+ if (!IsAllOnesVec(N->getOperand(Idx)))
+ continue;
+
+ SDValue X = N->getOperand(Idx ^ 1);
+ SDValue XNoCast = PeelSingleUseBitcast(X);
+ if (IsLogicOpcode(XNoCast.getOpcode()) || IsAllOnesXor(XNoCast) ||
+ IsLoadLike(XNoCast))
+ continue;
+
+ if (matchVPTERNLOG(N, N, N, N, X, X, X, 0x01))
+ return true;
+ }
+ }
+
// Avoid consuming OR into a stand-alone VPTERNLOG if it is part of a
// higher-level A & ~(B | C) shape. Let the parent AND/ANDNP matcher absorb
// the whole pattern instead.
@@ -4866,6 +4911,26 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
}
}
+ // Avoid consuming xor(logic_op, -1) (i.e. NOT of a logic sub-tree) into a
+ // stand-alone VPTERNLOG when the parent is also a logic op. The parent's
+ // ComputeTernlog will fold the NOT directly into the truth-table immediate,
+ // producing fewer instructions overall.
+ if (N->getOpcode() == ISD::XOR && N->hasOneUse() &&
+ (ISD::isBuildVectorAllOnes(N->getOperand(0).getNode()) ||
+ ISD::isBuildVectorAllOnes(N->getOperand(1).getNode()))) {
+ SDValue Inner = ISD::isBuildVectorAllOnes(N->getOperand(1).getNode())
+ ? N->getOperand(0)
+ : N->getOperand(1);
+ SDValue InnerNoCast = PeelSingleUseBitcast(Inner);
+ if (IsLogicOpcode(InnerNoCast.getOpcode())) {
+ SDNode *User = *N->user_begin();
+ while (User->getOpcode() == ISD::BITCAST && User->hasOneUse())
+ User = *User->user_begin();
+ if (IsLogicOpcode(User->getOpcode()))
+ return false;
+ }
+ }
+
// Fast-path: A & ~(B | C) -> vpternlog(A, B, C, 0x10)
if (N->getOpcode() == ISD::AND) {
for (unsigned Idx = 0; Idx != 2; ++Idx) {
@@ -4894,8 +4959,8 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
};
auto ComputeTernlog = [&](SDValue Root, SDNode *OpaqueSubtree,
- SmallVectorImpl<LeafInfo> &Leaves,
- uint8_t &ImmOut, bool &TooManyLeaves) {
+ SmallVectorImpl<LeafInfo> &Leaves, uint8_t &ImmOut,
+ bool &TooManyLeaves) {
TooManyLeaves = false;
auto lookupLeaf = [&](SDValue Leaf) -> std::optional<uint8_t> {
@@ -4989,7 +5054,13 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
bool TooManyLeaves = false;
if (ComputeTernlog(SDValue(N, 0), /*OpaqueSubtree=*/nullptr, Leaves, Imm,
TooManyLeaves)) {
- if (Leaves.size() < 2)
+ if (Leaves.empty())
+ return false;
+ // A single-leaf load folded into VPTERNLOG causes a redundant explicit
+ // load (for the tied src1=dst) plus a folded load, doubling memory
+ // traffic. Bail out and let default lowering handle it (e.g.
+ // SETALLONES + VPXORQ mem for NOT-of-load).
+ if (Leaves.size() == 1 && IsLoadLike(PeelSingleUseBitcast(Leaves[0].Leaf)))
return false;
return EmitFromLeaves(N, Leaves, Imm);
}
@@ -4998,6 +5069,45 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
// input leaf, then fold the remaining logic around it. This allows
// multi-level trees to be selected as chained VPTERNLOG operations.
if (TooManyLeaves) {
+ bool IsAssocOp = N->getOpcode() == ISD::OR || N->getOpcode() == ISD::AND ||
+ N->getOpcode() == ISD::XOR;
+ bool IsHomogeneousAssoc = IsAssocOp && IsHomogeneousAssociativeTree(
+ SDValue(N, 0), N->getOpcode());
+
+ // For homogeneous associative trees, prefer choosing an opaque subtree
+ // from one level below the root-side logic node so we can expose two fresh
+ // siblings and form 3-input VPTERNLOG combines (e.g. OR reductions as
+ // repeated imm=254), rather than creating long 2-input $252/$250 chains.
+ if (IsHomogeneousAssoc) {
+ for (unsigned RootIdx = 0; RootIdx != 2; ++RootIdx) {
+ SDValue Side = PeelSingleUseBitcast(N->getOperand(RootIdx));
+ if (!Side.hasOneUse() || Side.getOpcode() != N->getOpcode())
+ continue;
+
+ for (unsigned ChildIdx = 0; ChildIdx != 2; ++ChildIdx) {
+ SDValue Child = PeelSingleUseBitcast(Side.getOperand(ChildIdx));
+ if (!Child.hasOneUse() || Child.getOpcode() != N->getOpcode())
+ continue;
+
+ SmallVector<LeafInfo, 3> AssocLeaves;
+ uint8_t AssocImm = 0;
+ bool AssocTooManyLeaves = false;
+ if (!ComputeTernlog(SDValue(N, 0), Child.getNode(), AssocLeaves,
+ AssocImm, AssocTooManyLeaves))
+ continue;
+
+ if (AssocLeaves.size() < 2)
+ continue;
+
+ if (EmitFromLeaves(N, AssocLeaves, AssocImm))
+ return true;
+ }
+ }
+ // Fall through to generic cascading — for balanced trees all
+ // grandchildren may be leaves, so the child-as-opaque strategy below
+ // can still produce a valid 3-input combine.
+ }
+
auto IsGoodOpaqueCandidate = [&](SDValue V) {
SDValue P = PeelSingleUseBitcast(V);
if (IsAllOnesXor(P))
@@ -5019,6 +5129,25 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
N->getOperand(CandidateOrder[1]).hasOneUse())
std::swap(CandidateOrder[0], CandidateOrder[1]);
+ // Collect all opaque subtree candidates: direct children and, if a direct
+ // child is itself a logic op, its grandchildren. Trying grandchildren
+ // allows the root VPTERNLOG to absorb more distinct operands (3 instead
+ // of 2), which produces tighter cascades. Example:
+ //
+ // xor With child "and" opaque: 2 leaves (and, e)
+ // / \ With grandchild "or" opaque: 3 leaves (or, d, e)
+ // and e → saves one instruction in the cascade.
+ // / \
+ // or d
+ //
+ // Try direct children first. Only explore grandchildren when all direct
+ // children produce ≤2 leaves (i.e. a degenerate 2-input fold that wastes
+ // a VPTERNLOG slot) AND the opaque subtree itself has >3 leaves, meaning
+ // a single VPTERNLOG cannot handle it. When the opaque child fits in one
+ // VPTERNLOG (≤3 leaves), going deeper just reshuffles the split without
+ // saving instructions.
+ bool TriedDirect = false;
+ bool NeedsGrandchild = false;
for (unsigned Idx : CandidateOrder) {
SmallVector<LeafInfo, 3> CascadedLeaves;
uint8_t CascadedImm = 0;
@@ -5029,11 +5158,91 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
CascadedImm, CascadedTooManyLeaves))
continue;
- if (CascadedLeaves.size() < 2)
+ if (CascadedLeaves.empty())
continue;
- if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
- return true;
+ // If the direct child yields a 3-leaf fold, emit it right away — this
+ // is already optimal for this level.
+ if (CascadedLeaves.size() == 3) {
+ if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+ return true;
+ }
+ TriedDirect = true;
+
+ // Check if the opaque subtree itself has >3 leaves: if so, it cannot
+ // be handled by a single VPTERNLOG, and going one level deeper may
+ // help reduce the total instruction count.
+ if (CascadedLeaves.size() <= 2) {
+ SmallVector<LeafInfo, 3> SubLeaves;
+ uint8_t SubImm = 0;
+ bool SubTooMany = false;
+ if (!ComputeTernlog(SDValue(N->getOperand(Idx)),
+ /*OpaqueSubtree=*/nullptr, SubLeaves, SubImm,
+ SubTooMany) &&
+ SubTooMany)
+ NeedsGrandchild = true;
+ }
+ }
+
+ // Direct children only yielded ≤2-leaf folds and the opaque subtree has
+ // >3 leaves (can't fit in one VPTERNLOG). Try grandchildren — making a
+ // deeper subtree opaque exposes more leaves at the root level, reducing
+ // the total instruction count.
+ if (NeedsGrandchild) {
+ for (unsigned Idx : CandidateOrder) {
+ SDValue Child = N->getOperand(Idx);
+ SDValue ChildNoCast = PeelSingleUseBitcast(Child);
+ if (!ChildNoCast.hasOneUse() || !IsLogicOpcode(ChildNoCast.getOpcode()))
+ continue;
+
+ for (unsigned GIdx = 0; GIdx != 2; ++GIdx) {
+ SDValue GChild = ChildNoCast.getOperand(GIdx);
+ SDValue GChildNoCast = PeelSingleUseBitcast(GChild);
+
+ // Skip NOT-wrappers (xor X, -1): ComputeTernlog already folds NOT
+ // into the truth table, so cutting at a NOT boundary just pushes
+ // the NOT into a separate instruction without saving anything.
+ if (IsAllOnesXor(GChildNoCast))
+ continue;
+
+ if (!IsLogicOpcode(GChildNoCast.getOpcode()))
+ continue;
+
+ SmallVector<LeafInfo, 3> CascadedLeaves;
+ uint8_t CascadedImm = 0;
+ bool CascadedTooManyLeaves = false;
+
+ if (!ComputeTernlog(SDValue(N, 0), GChild.getNode(), CascadedLeaves,
+ CascadedImm, CascadedTooManyLeaves))
+ continue;
+
+ if (CascadedLeaves.size() < 3)
+ continue;
+
+ if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+ return true;
+ }
+ }
+ }
+
+ // Fall back to direct-child opaque with ≤2 leaves if nothing else worked.
+ if (TriedDirect) {
+ for (unsigned Idx : CandidateOrder) {
+ SmallVector<LeafInfo, 3> CascadedLeaves;
+ uint8_t CascadedImm = 0;
+ bool CascadedTooManyLeaves = false;
+ SDNode *OpaqueSubtree = N->getOperand(Idx).getNode();
+
+ if (!ComputeTernlog(SDValue(N, 0), OpaqueSubtree, CascadedLeaves,
+ CascadedImm, CascadedTooManyLeaves))
+ continue;
+
+ if (CascadedLeaves.empty())
+ continue;
+
+ if (EmitFromLeaves(N, CascadedLeaves, CascadedImm))
+ return true;
+ }
}
}
diff --git a/llvm/test/CodeGen/X86/vpternlog.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index 100d31883823d..4f9e384bd85a0 100644
--- a/llvm/test/CodeGen/X86/vpternlog.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -29,8 +29,7 @@ define <8 x i64> @foobar(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
; CHECK-LABEL: foobar:
; CHECK: # %bb.0:
; CHECK-NEXT: vpternlogq $208
-; CHECK-NEXT: vpternlogq $252
-; CHECK-NEXT: vpandnq
+; CHECK-NEXT: vpternlogq $16
; CHECK-NEXT: retq
%nb = xor <8 x i64> %b, splat (i64 -1)
%or = or <8 x i64> %nb, %c
@@ -97,3 +96,123 @@ define <8 x i64> @balanced_depth(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
%res = or <8 x i64> %l2, %r0
ret <8 x i64> %res
}
+
+define <8 x i64> @flip(ptr %x) {
+; CHECK-LABEL: flip:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq $85, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %ld = load <8 x i64>, ptr %x, align 64
+ %not = xor <8 x i64> %ld, splat (i64 -1)
+ ret <8 x i64> %not
+}
+
+define dso_local <8 x i64> @fubar(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+ <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+ <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+ local_unnamed_addr {
+; CHECK-LABEL: fubar:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK-NOT: vpternlogq $252
+; CHECK-NOT: vpternlogq $250
+; CHECK: retq
+Entry:
+ %9 = or <8 x i64> %1, %0
+ %10 = or <8 x i64> %9, %2
+ %11 = or <8 x i64> %10, %3
+ %12 = or <8 x i64> %11, %4
+ %13 = or <8 x i64> %12, %5
+ %14 = or <8 x i64> %13, %6
+ %15 = or <8 x i64> %14, %7
+ %16 = or <8 x i64> %15, %8
+ ret <8 x i64> %16
+}
+
+define dso_local <8 x i64> @baz(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2,
+ <8 x i64> %3, <8 x i64> %4, <8 x i64> %5,
+ <8 x i64> %6, <8 x i64> %7, <8 x i64> %8)
+ local_unnamed_addr {
+; CHECK-LABEL: baz:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK: vpternlogq $254
+; CHECK: retq
+Entry:
+ %9 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %0,
+ <8 x i64> %1,
+ <8 x i64> %2,
+ i32 254)
+ %10 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %3,
+ <8 x i64> %4,
+ <8 x i64> %5,
+ i32 254)
+ %11 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %6,
+ <8 x i64> %7,
+ <8 x i64> %8,
+ i32 254)
+ %12 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %9,
+ <8 x i64> %10,
+ <8 x i64> %11,
+ i32 254)
+ ret <8 x i64> %12
+}
+
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>,
+ <8 x i64>, i32 immarg)
+
+; 256-bit vector case — exercises VLX path.
+define <4 x i64> @foo_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
+; CHECK-LABEL: foo_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq $1
+; CHECK-NEXT: retq
+ %or1 = or <4 x i64> %b, %a
+ %or2 = or <4 x i64> %or1, %c
+ %not = xor <4 x i64> %or2, splat (i64 -1)
+ ret <4 x i64> %not
+}
+
+; 128-bit vector case — exercises VLX path.
+define <2 x i64> @foo_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: foo_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq $1
+; CHECK-NEXT: retq
+ %or1 = or <2 x i64> %b, %a
+ %or2 = or <2 x i64> %or1, %c
+ %not = xor <2 x i64> %or2, splat (i64 -1)
+ ret <2 x i64> %not
+}
+
+; Balanced OR tree — tests fallthrough from homogeneous-associative to generic.
+define <8 x i64> @balanced_or4(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c,
+ <8 x i64> %d) {
+; CHECK-LABEL: balanced_or4:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq $254
+; CHECK-NOT: vporq
+; CHECK: retq
+ %t1 = or <8 x i64> %a, %b
+ %t2 = or <8 x i64> %c, %d
+ %res = or <8 x i64> %t1, %t2
+ ret <8 x i64> %res
+}
+
+; Multi-use operand — %a used in both ORs. hasOneUse() on the shared
+; value must not block the combine.
+define <8 x i64> @shared_operand(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
+; CHECK-LABEL: shared_operand:
+; CHECK: # %bb.0:
+; CHECK: vpternlogq
+; CHECK-NEXT: retq
+ %or1 = or <8 x i64> %a, %b
+ %or2 = or <8 x i64> %a, %c
+ %res = and <8 x i64> %or1, %or2
+ ret <8 x i64> %res
+}
More information about the llvm-commits
mailing list