[llvm] [WIP][X86] Use GFNI for vXi8 per-element shifts (PR #89644)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 22 11:12:04 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h
These are a bit more complicated than the gf2p8affine lookups, requiring us to convert the SHL shift amount into a GF(2^8) multiplicand so we can perform the shift as a multiplication. SRL/SRA then need to be converted to SHL via bitreverse / variable sign-extension.
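For readers unfamiliar with the trick, here is a minimal scalar sketch of the SHL lowering (`gf2p8mulb_scalar` and `gf_shl` are hypothetical helper names for illustration, not part of the patch): masking a byte with `0xFF >> amt` before multiplying by `1 << amt` in GF(2^8) keeps the product below degree 8, so no reduction fires and the GF multiply degenerates into a plain left shift. The two PSHUFB LUT constants in the patch (`0x0103070F1F3F7FFF` and `0x8040201008040201`) hold exactly these per-amount mask and index bytes.

```cpp
// Scalar model of the lowering; a sketch only, assuming the standard
// GFNI reduction polynomial x^8 + x^4 + x^3 + x + 1 (0x11B).
#include <cassert>
#include <cstdint>

// Models the GF2P8MULB instruction on one byte: carryless 8x8 multiply
// followed by reduction modulo 0x11B.
static uint8_t gf2p8mulb_scalar(uint8_t a, uint8_t b) {
  uint16_t acc = 0;
  for (int i = 0; i < 8; ++i)
    if (b & (1u << i))
      acc ^= (uint16_t)(a << i);
  for (int i = 14; i >= 8; --i)
    if (acc & (1u << i))
      acc ^= (uint16_t)(0x11B << (i - 8));
  return (uint8_t)acc;
}

// Per-element SHL: clear the top 'amt' bits so the product never reaches
// degree 8 (no reduction), then multiply by 2^amt in GF(2^8). The patch
// materializes the two byte LUTs below and applies them with PSHUFB;
// amounts >= 8 hit the zeroed upper half of the LUTs and produce 0.
static uint8_t gf_shl(uint8_t x, unsigned amt) {
  uint8_t mask = amt < 8 ? (uint8_t)(0xFFu >> amt) : 0; // 0x0103070F1F3F7FFF
  uint8_t idx = amt < 8 ? (uint8_t)(1u << amt) : 0;     // 0x8040201008040201
  return gf2p8mulb_scalar(x & mask, idx);
}

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned amt = 0; amt < 9; ++amt)
      assert(gf_shl((uint8_t)x, amt) == (uint8_t)(amt < 8 ? x << amt : 0));
  return 0;
}
```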
Followup to #89115
CC @shamithoke
---
Patch is 231.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89644.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+65)
- (modified) llvm/test/CodeGen/X86/gfni-funnel-shifts.ll (+409-776)
- (modified) llvm/test/CodeGen/X86/gfni-shifts.ll (+961-1635)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dd40d079c7e2f7..b8e5255bc56c55 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29564,6 +29564,62 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
+ // GFNI - we can perform SHL with a GF multiplication, and can convert
+ // SRL/SRA to a SHL.
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
+ if (Subtarget.hasGFNI() && Subtarget.hasSSSE3()) {
+ auto GFShiftLeft = [&](SDValue Val) {
+ // Use PSHUFB as a LUT from the shift amount to create a per-element
+ // byte mask for the shift value and an index. For shift amounts greater
+ // than 7, the result will be zero.
+ SmallVector<APInt, 8> MaskBits, IdxBits;
+ for (unsigned I = 0, E = VT.getSizeInBits() / 128; I != E; ++I) {
+ MaskBits.push_back(APInt(64, 0x0103070F1F3F7FFFULL));
+ IdxBits.push_back(APInt(64, 0x8040201008040201ULL));
+ MaskBits.push_back(APInt::getZero(64));
+ IdxBits.push_back(APInt::getZero(64));
+ }
+
+ MVT CVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ SDValue Mask =
+ DAG.getBitcast(VT, getConstVector(MaskBits, CVT, DAG, dl));
+ SDValue Idx = DAG.getBitcast(VT, getConstVector(IdxBits, CVT, DAG, dl));
+ Mask = DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
+ Idx = DAG.getNode(X86ISD::PSHUFB, dl, VT, Idx, Amt);
+ Mask = DAG.getNode(ISD::AND, dl, VT, Val, Mask);
+ return DAG.getNode(X86ISD::GF2P8MULB, dl, VT, Mask, Idx);
+ };
+
+ if (Opc == ISD::SHL)
+ return GFShiftLeft(R);
+
+ // srl(x,y)
+ // --> bitreverse(shl(bitreverse(x),y))
+ if (Opc == ISD::SRL) {
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = GFShiftLeft(R);
+ return DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ }
+
+ // sra(x,y)
+ // --> sub(xor(srl(x,y), m),m)
+ // --> sub(xor(bitreverse(shl(bitreverse(x),y)), m),m)
+ // where m = srl(signbit, amt) --> bitreverse(shl(lsb, amt))
+ if (Opc == ISD::SRA) {
+ SDValue LSB = DAG.getConstant(APInt::getOneBitSet(8, 0), dl, VT);
+ SDValue M = DAG.getNode(ISD::BITREVERSE, dl, VT, GFShiftLeft(LSB));
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = GFShiftLeft(R);
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+ }
+ }
+
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
@@ -55614,6 +55670,15 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ConcatSubOperand(VT, Ops, 0));
}
break;
+ case X86ISD::GF2P8MULB:
+ if (!IsSplat &&
+ (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1));
+ }
+ break;
case X86ISD::GF2P8AFFINEQB:
if (!IsSplat &&
(VT.is256BitVector() ||
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0c341dc63a9ecc..f22df047d15b7d 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -142,114 +142,71 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_fshr_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: pand %xmm5, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: movdqa %xmm0, %xmm4
-; GFNISSE-NEXT: paddb %xmm0, %xmm4
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $4, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $2, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $1, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: pandn %xmm5, %xmm3
-; GFNISSE-NEXT: psllw $5, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm4
-; GFNISSE-NEXT: paddb %xmm3, %xmm4
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: psllw $4, %xmm5
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: psllw $2, %xmm3
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: por %xmm1, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: pandn %xmm3, %xmm4
+; GFNISSE-NEXT: movq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNISSE-NEXT: movdqa %xmm5, %xmm6
+; GFNISSE-NEXT: pshufb %xmm4, %xmm6
+; GFNISSE-NEXT: movq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: pshufb %xmm4, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pand %xmm8, %xmm0
+; GFNISSE-NEXT: gf2p8mulb %xmm6, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT: pand %xmm3, %xmm2
+; GFNISSE-NEXT: pshufb %xmm2, %xmm7
+; GFNISSE-NEXT: pand %xmm1, %xmm7
+; GFNISSE-NEXT: pshufb %xmm2, %xmm5
+; GFNISSE-NEXT: gf2p8mulb %xmm7, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5
+; GFNISSE-NEXT: por %xmm5, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: var_fshr_v16i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT: vpsllw $5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNIAVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNIAVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm2
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8mulb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; GFNIAVX1-NEXT: vgf2p8mulb %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: var_fshr_v16i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; GFNIAVX2-NEXT: vpsllw $5, %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; GFNIAVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; GFNIAVX2-NEXT: vmovq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNIAVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm6
+; GFNIAVX2-NEXT: vmovq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNIAVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpsllw $4, %xmm0, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpsllw $2, %xmm0, %xmm2
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; GFNIAVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vgf2p8mulb %xmm6, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; GFNIAVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; GFNIAVX2-NEXT: vgf2p8mulb %xmm2, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX2-NEXT: retq
;
@@ -719,34 +676,25 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNIAVX512VL-LABEL: var_fshl_v32i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [9241421688590303745,0,9241421688590303745,0]
+; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm6
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [72909780498219007,0,72909780498219007,0]
+; GFNIAVX512VL-NEXT: # ymm7 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vgf2p8mulb %ymm6, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpshufb %ymm2, %ymm7, %ymm3
; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
-; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; GFNIAVX512VL-NEXT: vgf2p8mulb %ymm2, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: retq
;
@@ -769,219 +717,133 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_fshr_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm4, %xmm6
-; GFNISSE-NEXT: movdqa %xmm0, %xmm4
-; GFNISSE-NEXT: movdqa %xmm2, %xmm9
-; GFNISSE-NEXT: psrlw $4, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT: pand %xmm8, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pand %xmm7, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm10
-; GFNISSE-NEXT: psrlw $2, %xmm10
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT: pand %xmm9, %xmm10
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm11
-; GFNISSE-NEXT: psrlw $1, %xmm11
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT: pand %xmm10, %xmm11
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm2
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm12
-; GFNISSE-NEXT: psllw $4, %xmm12
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT: pand %xmm11, %xmm12
-; GFNISSE-NEXT: pandn %xmm7, %xmm6
-; GFNISSE-NEXT: psllw $5, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm13
-; GFNISSE-NEXT: psllw $2, %xmm13
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT: pand %xmm12, %xmm13
-; GFNISSE-NEXT: paddb %xmm6, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm13
-; GFNISSE-NEXT: paddb %xmm4, %xmm13
-; GFNISSE-NEXT: paddb %xmm6, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT: por %xmm2, %xmm4
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $4, %xmm2
-; GFNISSE-NEXT: pand %xmm8, %xmm2
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pand %xmm7, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $2, %xmm2
-; GFNISSE-NEXT: pand %xmm9, %xmm2
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $1, %xmm2
-; GFNISSE-NEXT: pand %xmm10, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm9
+; GFNISSE-NEXT: pandn %xmm8, %xmm9
+; GFNISSE-NEXT: movq {{.*#+}} xmm6 = [9241421688590303745,0]
+; GFNISSE-NEXT: movdqa %xmm6, %xmm10
+; GFNISSE-NEXT: pshufb %xmm9, %xmm10
+; GFNISSE-NEXT: movq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNISSE-NEXT: movdqa %xmm7, %xmm11
+; GFNISSE-NEXT: pshufb %xmm9, %xmm11
; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT: pand %xmm11, %xmm0
+; GFNISSE-NEXT: gf2p8mulb %xmm10, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
+; GFNISSE-NEXT: pand %xmm8, %xmm4
+; GFNISSE-NEXT: movdqa %xmm7, %xmm10
+; GFNISSE-NEXT: pshufb %xmm4, %xmm10
+; GFNISSE-NEXT: pand %xmm2, %xmm10
+; GFNISSE-NEXT: movdqa %xmm6, %xmm2
+; GFNISSE-NEXT: pshufb %xmm4, %xmm2
+; GFNISSE-NEXT: gf2p8mulb %xmm10, %xmm2
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
+; GFNISSE-NEXT: por %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm5, %xmm2
+; GFNISSE-NEXT: pandn %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm6, %xmm4
+; GFNISSE-NEXT: pshufb %xmm2, %xmm4
+; GFNISSE-NEXT: movdqa %xmm7, %xmm10
+; GFNISSE-NEXT: pshufb %xmm2, %xmm10
; GFNISSE-NEXT: paddb %xmm1, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: psllw $4, %xmm2
-; GFNISSE-NEXT: pand %xmm11, %xmm2
-; GFNISSE-NEXT: pandn %xmm7, %xmm5
-; GFNISSE-NEXT: psllw $5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: psllw $2, %xmm2
-; GFNISSE-NEXT: pand %xmm12, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pb...
[truncated]
``````````
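For reference, the SRL/SRA lowerings above rest on two scalar identities: `srl(x,k) == bitreverse(shl(bitreverse(x),k))`, and `sra(x,k) == (srl(x,k) ^ m) - m` where `m = srl(signbit,k)` (computed in the patch as `bitreverse(shl(lsb,k))`). A quick standalone check of both, independent of GFNI; `bitreverse8` is an illustrative helper, not patch code:

```cpp
#include <cassert>
#include <cstdint>

// Reverse the bits of one byte (swap nibbles, then pairs, then bits).
static uint8_t bitreverse8(uint8_t x) {
  x = (uint8_t)((x & 0xF0) >> 4 | (x & 0x0F) << 4);
  x = (uint8_t)((x & 0xCC) >> 2 | (x & 0x33) << 2);
  x = (uint8_t)((x & 0xAA) >> 1 | (x & 0x55) << 1);
  return x;
}

int main() {
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned k = 0; k < 8; ++k) {
      uint8_t srl = (uint8_t)(x >> k);
      // srl(x,k) == bitreverse(shl(bitreverse(x),k))
      assert(bitreverse8((uint8_t)(bitreverse8((uint8_t)x) << k)) == srl);
      // sra(x,k) == (srl(x,k) ^ m) - m, with m = srl(signbit,k); the
      // reference uses int8_t's arithmetic shift (guaranteed in C++20).
      uint8_t m = (uint8_t)(0x80u >> k);
      assert((uint8_t)((srl ^ m) - m) == (uint8_t)((int8_t)x >> k));
    }
  }
  return 0;
}
```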
</details>
https://github.com/llvm/llvm-project/pull/89644