[llvm] a631f3e - [X86] Fold vgf2p8affineqb XOR with splat constant into immediate (#179103)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 4 02:11:52 PST 2026


Author: bala-bhargav
Date: 2026-02-04T10:11:48Z
New Revision: a631f3e434b41471aa9f852f23bbc1d0ab4fa7bd

URL: https://github.com/llvm/llvm-project/commit/a631f3e434b41471aa9f852f23bbc1d0ab4fa7bd
DIFF: https://github.com/llvm/llvm-project/commit/a631f3e434b41471aa9f852f23bbc1d0ab4fa7bd.diff

LOG: [X86] Fold vgf2p8affineqb XOR with splat constant into immediate (#179103)

The vgf2p8affineqb instruction performs an affine transformation on each
byte and then XORs the result with an 8-bit immediate operand. When this
instruction is followed by a standalone XOR with a splatted constant,
LLVM currently generates extra instructions instead of folding the
constant into the instruction's immediate.
This PR adds a DAG combine optimization that detects the pattern
vgf2p8affineqb(x, m, imm8) ^ C where C is a splatted 8-bit constant and
transforms it to vgf2p8affineqb(x, m, imm8 ^ C), eliminating the
unnecessary XOR instruction.
- The optimization runs during the combine phase after type legalization
- Handles XOR with the constant on either side (commutative)
- Only applies when the GFNI instruction has a single use to avoid
de-optimization
- Validates that the XOR operand is a splatted 8-bit constant before
folding
- Includes test coverage for positive cases and negative cases
(multi-use, non-splat constant, variable XOR)

Added: 
    llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
    llvm/test/CodeGen/X86/gfni-xor-fold.ll

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aec1a3361ed8a..e1aad028c14c0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55864,6 +55864,32 @@ static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
 }
 
+static SDValue combineXorWithGF2P8AFFINEQB(SDNode *N, const SDLoc &DL,
+                                           SelectionDAG &DAG, EVT VT) {
+  using namespace SDPatternMatch;
+
+  SDValue X, Y, SplatOp;
+  APInt Imm;
+  // Use sd_match for structure matching - m_Xor handles commutation
+  if (!sd_match(N, m_Xor(m_OneUse(m_TernaryOp(X86ISD::GF2P8AFFINEQB, m_Value(X),
+                                              m_Value(Y), m_ConstInt(Imm))),
+                         m_Value(SplatOp))))
+    return SDValue();
+
+  // GF2P8AFFINEQB only operates on i8 vector types
+  assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+         "Unsupported GFNI type");
+
+  // Use X86::isConstantSplat for robust splat constant extraction
+  APInt SplatVal;
+  if (!X86::isConstantSplat(SplatOp, SplatVal, /*AllowPartialUndefs=*/false))
+    return SDValue();
+
+  uint64_t NewImm = Imm.getZExtValue() ^ SplatVal.getZExtValue();
+  return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, X, Y,
+                     DAG.getTargetConstant(NewImm, DL, MVT::i8));
+}
+
 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
@@ -55966,6 +55992,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
 
   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
     return RV;
+  if (SDValue R = combineXorWithGF2P8AFFINEQB(N, DL, DAG, VT))
+    return R;
 
   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();

diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll b/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
new file mode 100644
index 0000000000000..bf5dd46579813
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+gfni | FileCheck %s
+
+declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+
+define <64 x i8> @test_affine_xor_fold_512(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgf2p8affineqb $255, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+  %xor = xor <64 x i8> %gfni, splat(i8 -1)
+  ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_fold_512_nonzero_imm(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512_nonzero_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgf2p8affineqb $175, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5)
+  %xor = xor <64 x i8> %gfni, splat(i8 -86)
+  ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_fold_512_commutative(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512_commutative:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgf2p8affineqb $255, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+  %xor = xor <64 x i8> splat(i8 -1), %gfni
+  ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_no_fold_512_multi_use(<64 x i8> %src1, <64 x i8> %src2, ptr %out) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_no_fold_512_multi_use:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = ~zmm0
+; CHECK-NEXT:    retq
+  %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+  store <64 x i8> %gfni, ptr %out
+  %xor = xor <64 x i8> %gfni, splat(i8 -1)
+  ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_no_fold_512_variable(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %var) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_no_fold_512_variable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+  %xor = xor <64 x i8> %gfni, %var
+  ret <64 x i8> %xor
+}

diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold.ll b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
new file mode 100644
index 0000000000000..510f4cefa4e57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+
+define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5)
+  %xor = xor <16 x i8> %gfni, splat(i8 -86)
+  ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_hex:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_hex:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17)
+  %xor = xor <16 x i8> %gfni, splat(i8 66)
+  ret <16 x i8> %xor
+}
+
+define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0)
+  %xor = xor <32 x i8> %gfni, splat(i8 -1)
+  ret <32 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_commutative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_commutative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> splat(i8 -1), %gfni
+  ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  store <16 x i8> %gfni, ptr %out
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>
+  ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_variable:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_variable:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, %var
+  ret <16 x i8> %xor
+}


        


More information about the llvm-commits mailing list