[llvm] [X86] Fold vgf2p8affineqb XOR with splat constant into immediate (PR #179103)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 01:39:43 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/179103
>From b36941629f31de49e8d55b5191f811dd99ea43bf Mon Sep 17 00:00:00 2001
From: bhargav <penugondabalabharghav at gmail.com>
Date: Sun, 1 Feb 2026 17:43:29 +0530
Subject: [PATCH] [X86] Fold vgf2p8affineqb XOR with splat constant into
immediate
The vgf2p8affineqb instruction applies an XOR with its immediate operand
after the affine transformation. When followed by a standalone XOR with
a splatted 8-bit constant, this can be folded into the instruction itself
by updating the immediate: vgf2p8affineqb(x, m, imm8) ^ C => vgf2p8affineqb(x, m, imm8 ^ C)
This eliminates the now-redundant standalone XOR instruction and reduces code size.
Fixes #178795
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 28 ++++
llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll | 63 ++++++++
llvm/test/CodeGen/X86/gfni-xor-fold.ll | 144 ++++++++++++++++++
3 files changed, 235 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
create mode 100644 llvm/test/CodeGen/X86/gfni-xor-fold.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ca73b336225f2..f402ae725da19 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55821,6 +55821,32 @@ static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
+// Fold (xor (gf2p8affineqb X, M, imm8), splat(C)) -> gf2p8affineqb(X, M, imm8^C).
+// GF2P8AFFINEQB already XORs its 8-bit immediate into every result byte after
+// the affine transform, so a following XOR with a splatted i8 constant can be
+// absorbed into the immediate for free. Returns SDValue() when the pattern
+// does not match.
+static SDValue combineXorWithGF2P8AFFINEQB(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG, EVT VT) {
+ using namespace SDPatternMatch;
+
+ SDValue X, Y, SplatOp;
+ APInt Imm;
+ // Use sd_match for structure matching - m_Xor handles commutation.
+ // m_OneUse: only fold when the affine result has no other users; otherwise
+ // both the original instruction and the rewritten one would stay live.
+ if (!sd_match(N, m_Xor(m_OneUse(m_TernaryOp(X86ISD::GF2P8AFFINEQB, m_Value(X),
+ m_Value(Y), m_ConstInt(Imm))),
+ m_Value(SplatOp))))
+ return SDValue();
+
+ // GF2P8AFFINEQB only operates on i8 vector types
+ assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+ "Unsupported GFNI type");
+
+ // Use X86::isConstantSplat for robust splat constant extraction; it bails
+ // out on non-splat vectors, and partial undefs are rejected explicitly.
+ APInt SplatVal;
+ if (!X86::isConstantSplat(SplatOp, SplatVal, /*AllowPartialUndefs=*/false))
+ return SDValue();
+
+ // Merge the external XOR constant into the instruction's imm8 operand.
+ uint64_t NewImm = Imm.getZExtValue() ^ SplatVal.getZExtValue();
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, X, Y,
+ DAG.getTargetConstant(NewImm, DL, MVT::i8));
+}
+
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
@@ -55923,6 +55949,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
return RV;
+ if (SDValue R = combineXorWithGF2P8AFFINEQB(N, DL, DAG, VT))
+ return R;
// Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll b/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
new file mode 100644
index 0000000000000..bf5dd46579813
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold-avx512.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw,+gfni | FileCheck %s
+
+declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+
+define <64 x i8> @test_affine_xor_fold_512(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $255, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+ %xor = xor <64 x i8> %gfni, splat(i8 -1)
+ ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_fold_512_nonzero_imm(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512_nonzero_imm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $175, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5)
+ %xor = xor <64 x i8> %gfni, splat(i8 -86)
+ ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_fold_512_commutative(<64 x i8> %src1, <64 x i8> %src2) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_fold_512_commutative:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $255, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+ %xor = xor <64 x i8> splat(i8 -1), %gfni
+ ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_no_fold_512_multi_use(<64 x i8> %src1, <64 x i8> %src2, ptr %out) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_no_fold_512_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi)
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
+; CHECK-NEXT: retq
+ %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+ store <64 x i8> %gfni, ptr %out
+ %xor = xor <64 x i8> %gfni, splat(i8 -1)
+ ret <64 x i8> %xor
+}
+
+define <64 x i8> @test_affine_xor_no_fold_512_variable(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %var) nounwind {
+;
+; CHECK-LABEL: test_affine_xor_no_fold_512_variable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %gfni = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 0)
+ %xor = xor <64 x i8> %gfni, %var
+ ret <64 x i8> %xor
+}
diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold.ll b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
new file mode 100644
index 0000000000000..510f4cefa4e57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+
+define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_128:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_fold_128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+ %xor = xor <16 x i8> %gfni, splat(i8 -1)
+ ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5)
+ %xor = xor <16 x i8> %gfni, splat(i8 -86)
+ ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_hex:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_fold_hex:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17)
+ %xor = xor <16 x i8> %gfni, splat(i8 66)
+ ret <16 x i8> %xor
+}
+
+define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_fold_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0)
+ %xor = xor <32 x i8> %gfni, splat(i8 -1)
+ ret <32 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_commutative:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_fold_commutative:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+ %xor = xor <16 x i8> splat(i8 -1), %gfni
+ ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+ store <16 x i8> %gfni, ptr %out
+ %xor = xor <16 x i8> %gfni, splat(i8 -1)
+ ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+ %xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>
+ ret <16 x i8> %xor
+}
+
+define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_variable:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_variable:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+ %xor = xor <16 x i8> %gfni, %var
+ ret <16 x i8> %xor
+}
More information about the llvm-commits
mailing list