[llvm] [X86] Fold vgf2p8affineqb XOR with splat constant into immediate (PR #179103)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 1 04:15:45 PST 2026
https://github.com/bala-bhargav created https://github.com/llvm/llvm-project/pull/179103
Fold `vgf2p8affineqb(x, m, imm8) ^ C` => `vgf2p8affineqb(x, m, imm8 ^ C)` where C is a splatted 8-bit constant.
The `vgf2p8affineqb` instruction applies an XOR with its immediate operand after the affine transformation. When followed by a standalone XOR with a splatted 8-bit constant, this can be folded into the instruction itself by updating the immediate.
From aa777c319048e059e911a55d91ebc87d5460a827 Mon Sep 17 00:00:00 2001
From: bhargav <penugondabalabharghav at gmail.com>
Date: Sun, 1 Feb 2026 17:43:29 +0530
Subject: [PATCH] [X86] Fold vgf2p8affineqb XOR with splat constant into
immediate
The vgf2p8affineqb instruction applies an XOR with its immediate operand
after the affine transformation. When followed by a standalone XOR with
a splatted 8-bit constant, this can be folded into the instruction itself
by updating the immediate: vgf2p8affineqb(x, m, imm8) ^ C => vgf2p8affineqb(x, m, imm8 ^ C)
This eliminates unnecessary XOR instructions and improves code size.
Fixes #178795
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 55 +++++++++
llvm/test/CodeGen/X86/gfni-xor-fold.ll | 144 ++++++++++++++++++++++++
2 files changed, 199 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/gfni-xor-fold.ll
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ca73b336225f2..422b59831f8e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55821,6 +55821,59 @@ static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
+// Fold (xor (GF2P8AFFINEQB X, M, Imm), SplatC) ->
+//   (GF2P8AFFINEQB X, M, Imm ^ SplatC)
+// GF2P8AFFINEQB already XORs the broadcast 8-bit immediate into every byte of
+// the affine-transform result, so a following XOR against a splatted 8-bit
+// constant can be merged into that immediate.
+// NOTE(review): Subtarget is currently unused -- confirm whether a GFNI
+// feature check is needed here or whether the parameter should be dropped.
+static SDValue combineXorWithGF2P8AFFINEQB(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // XOR is commutative; canonicalize so the GF2P8AFFINEQB node ends up in N0
+  // and the candidate constant operand in N1.
+  if (N0.getOpcode() != X86ISD::GF2P8AFFINEQB) {
+    if (N1.getOpcode() != X86ISD::GF2P8AFFINEQB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  // Bail if the affine result has other users: rewriting the immediate would
+  // not remove the original node and its other uses must see the un-XORed
+  // value.
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  // GF2P8AFFINEQB operates on vectors of i8 elements; only fold legal types.
+  EVT VT = N->getValueType(0);
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  if (ScalarSizeInBits != 8)
+    return SDValue();
+
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT.getSimpleVT()))
+    return SDValue();
+
+  // Extract the per-byte constant bits of the XOR operand. Whole-undef
+  // elements are tolerated; partially-undef elements are rejected.
+  APInt UndefElts;
+  SmallVector<APInt, 16> EltBits;
+  if (!getTargetConstantBitsFromNode(N1, ScalarSizeInBits, UndefElts, EltBits,
+                                     /*AllowWholeUndefs=*/true,
+                                     /*AllowPartialUndefs=*/false))
+    return SDValue();
+
+  // All defined elements must agree on a single splatted 8-bit value.
+  std::optional<APInt> SplatVal;
+  for (unsigned I = 0, E = EltBits.size(); I < E; I++) {
+    if (UndefElts[I])
+      continue;
+    if (!SplatVal)
+      SplatVal = EltBits[I];
+    else if (*SplatVal != EltBits[I])
+      return SDValue();
+  }
+
+  // Every element undef: nothing meaningful to fold.
+  if (!SplatVal)
+    return SDValue();
+
+  // The instruction's imm8 operand must itself be a constant.
+  auto *OldImmNode = dyn_cast<ConstantSDNode>(N0.getOperand(2));
+  if (!OldImmNode)
+    return SDValue();
+
+  // Rebuild the affine node with the XOR folded into its immediate.
+  uint64_t NewImm = OldImmNode->getZExtValue() ^ SplatVal->getZExtValue();
+  SDLoc DL(N);
+  return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, N0.getValueType(),
+                     N0.getOperand(0), N0.getOperand(1),
+                     DAG.getTargetConstant(NewImm, DL, MVT::i8));
+}
+
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
@@ -55923,6 +55976,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
return RV;
+ if (SDValue R = combineXorWithGF2P8AFFINEQB(N, DAG, Subtarget))
+ return R;
// Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/X86/gfni-xor-fold.ll b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
new file mode 100644
index 0000000000000..510f4cefa4e57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gfni-xor-fold.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+
+; Basic 128-bit fold: the XOR with an all-ones splat is absorbed into the
+; GF2P8AFFINEQB immediate (0 ^ 0xFF = 255), leaving a single instruction.
+define <16 x i8> @test_affine_xor_fold_128(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+; Nonzero starting immediate: 5 ^ 0xAA (splat i8 -86) = 0xAF = 175.
+define <16 x i8> @test_affine_xor_fold_nonzero_imm(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_nonzero_imm:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $175, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5)
+  %xor = xor <16 x i8> %gfni, splat(i8 -86)
+  ret <16 x i8> %xor
+}
+
+; Another immediate combination: 17 (0x11) ^ 66 (0x42) = 83 (0x53).
+define <16 x i8> @test_affine_xor_fold_hex(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_hex:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_hex:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $83, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 17)
+  %xor = xor <16 x i8> %gfni, splat(i8 66)
+  ret <16 x i8> %xor
+}
+
+; 256-bit variant of the basic fold: 0 ^ 0xFF = 255 on ymm registers.
+define <32 x i8> @test_affine_xor_fold_256(<32 x i8> %src1, <32 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %gfni = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 0)
+  %xor = xor <32 x i8> %gfni, splat(i8 -1)
+  ret <32 x i8> %xor
+}
+
+; XOR is commutative: the constant splat appears as the FIRST xor operand and
+; the fold must still fire.
+define <16 x i8> @test_affine_xor_fold_commutative(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_fold_commutative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_fold_commutative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $255, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> splat(i8 -1), %gfni
+  ret <16 x i8> %xor
+}
+
+; Negative test: the affine result is also stored, so it has a second use and
+; the XOR must stay separate (the stored value must not be XORed).
+define <16 x i8> @test_affine_xor_no_fold_multi_use(<16 x i8> %src1, <16 x i8> %src2, ptr %out) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_multi_use:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  store <16 x i8> %gfni, ptr %out
+  %xor = xor <16 x i8> %gfni, splat(i8 -1)
+  ret <16 x i8> %xor
+}
+
+; Negative test: the XOR constant differs per element (not a splat), so it
+; cannot be folded into the single 8-bit immediate.
+define <16 x i8> @test_affine_xor_no_fold_non_splat(<16 x i8> %src1, <16 x i8> %src2) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_non_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>
+  ret <16 x i8> %xor
+}
+
+; Negative test: the XOR operand is a runtime value, not a constant, so no
+; immediate folding is possible.
+define <16 x i8> @test_affine_xor_no_fold_variable(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %var) nounwind {
+;
+; AVX-LABEL: test_affine_xor_no_fold_variable:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_affine_xor_no_fold_variable:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %gfni = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 0)
+  %xor = xor <16 x i8> %gfni, %var
+  ret <16 x i8> %xor
+}
More information about the llvm-commits
mailing list