[llvm] [X86] Fold vgf2p8affineqb XOR with splat constant into immediate (PR #179103)

via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 1 04:15:45 PST 2026


https://github.com/bala-bhargav created https://github.com/llvm/llvm-project/pull/179103

Fold `vgf2p8affineqb(x, m, imm8) ^ C` => `vgf2p8affineqb(x, m, imm8 ^ C)` where C is a splatted 8-bit constant.
The `vgf2p8affineqb` instruction applies an XOR with its immediate operand after the affine transformation. When followed by a standalone XOR with a splatted 8-bit constant, this can be folded into the instruction itself by updating the immediate.

>From aa777c319048e059e911a55d91ebc87d5460a827 Mon Sep 17 00:00:00 2001
From: bhargav <penugondabalabharghav at gmail.com>
Date: Sun, 1 Feb 2026 17:43:29 +0530
Subject: [PATCH] [X86] Fold vgf2p8affineqb XOR with splat constant into
 immediate

The vgf2p8affineqb instruction applies an XOR with its immediate operand
after the affine transformation. When followed by a standalone XOR with
a splatted 8-bit constant, this can be folded into the instruction itself
by updating the immediate: vgf2p8affineqb(x, m, imm8) ^ C => vgf2p8affineqb(x, m, imm8 ^ C)

This eliminates unnecessary XOR instructions and improves code size.

Fixes #178795
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  55 +++++++++
 llvm/test/CodeGen/X86/gfni-xor-fold.ll  | 144 ++++++++++++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/gfni-xor-fold.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ca73b336225f2..422b59831f8e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55821,6 +55821,59 @@ static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
 }
 
+// Fold XOR(GF2P8AFFINEQB(X, A, Imm), SplatC)
+//   -> GF2P8AFFINEQB(X, A, Imm ^ SplatC)
+// The GF2P8AFFINEQB instruction XORs its 8-bit immediate into every result
+// byte after performing the affine transform, so a following XOR with a
+// splatted byte constant can be absorbed into the immediate.
+static SDValue combineXorWithGF2P8AFFINEQB(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Canonicalize the affine node into N0 (XOR is commutative).
+  if (N0.getOpcode() != X86ISD::GF2P8AFFINEQB) {
+    if (N1.getOpcode() != X86ISD::GF2P8AFFINEQB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  // If the affine result has other users they still need the un-XORed value,
+  // so folding wouldn't remove the instruction.
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  // Only handle legal vXi8 types (the only types GF2P8AFFINEQB produces).
+  EVT VT = N->getValueType(0);
+  if (VT.getScalarSizeInBits() != 8 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // The other XOR operand must be a splatted byte constant (whole-undef
+  // elements are tolerated, partially-undef elements are not - this matches
+  // the constant-bits extraction the original scan performed).
+  APInt SplatVal;
+  if (!X86::isConstantSplat(N1, SplatVal, /*AllowPartialUndefs=*/false) ||
+      SplatVal.getBitWidth() != 8)
+    return SDValue();
+
+  auto *OldImmNode = dyn_cast<ConstantSDNode>(N0.getOperand(2));
+  if (!OldImmNode)
+    return SDValue();
+
+  // Merge the standalone XOR constant into the instruction immediate.
+  uint8_t NewImm = OldImmNode->getZExtValue() ^ SplatVal.getZExtValue();
+  SDLoc DL(N);
+  return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, N0.getOperand(0),
+                     N0.getOperand(1),
+                     DAG.getTargetConstant(NewImm, DL, MVT::i8));
+}
+
 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
@@ -55923,6 +55976,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
     return RV;
+  if (SDValue R = combineXorWithGF2P8AFFINEQB(N, DAG, Subtarget))
+    return R;
 
   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold.ll b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
new file mode 100644
index 0000000000000..510f4cefa4e57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+
+; Basic 128-bit fold: affine imm 0 XORed with splat -1 becomes imm 255.
+define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+; Non-zero starting immediate: 5 ^ 0xAA (-86) = 175.
+define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5)
+  %xor = xor <16 x i8> %gfni, splat(i8 -86)
+  ret <16 x i8> %xor
+}
+
+; Another immediate combination: 0x11 (17) ^ 0x42 (66) = 0x53 (83).
+define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_hex:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_hex:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17)
+  %xor = xor <16 x i8> %gfni, splat(i8 66)
+  ret <16 x i8> %xor
+}
+
+; Same fold applies to the 256-bit form of the instruction.
+define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0)
+  %xor = xor <32 x i8> %gfni, splat(i8 -1)
+  ret <32 x i8> %xor
+}
+
+; The fold must also fire when the constant is the first XOR operand.
+define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_commutative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_commutative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> splat(i8 -1), %gfni
+  ret <16 x i8> %xor
+}
+
+; Negative test: the affine result is also stored, so folding the XOR into
+; the immediate would change the stored value - no fold expected.
+define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  store <16 x i8> %gfni, ptr %out
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+; Negative test: the XOR constant is not a splat, so it cannot be represented
+; as a single 8-bit immediate - no fold expected.
+define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>
+  ret <16 x i8> %xor
+}
+
+; Negative test: the XOR operand is not a constant - no fold expected.
+define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_variable:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_variable:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, %var
+  ret <16 x i8> %xor
+}



More information about the llvm-commits mailing list