[llvm] [X86] LowerRotate - expand vXi8 non-uniform variable rotates using uniform constant rotates (PR #189986)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 08:58:12 PDT 2026
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/189986
We expand vXi8 non-uniform variable rotates as a sequence of uniform constant rotates, each guarded by a SELECT on whether the original rotate amount requires that step.
This patch removes the premature expansion of those uniform constant rotates into OR(SHL,SRL) sequences, allowing GFNI targets to lower each rotate to a single VGF2P8AFFINEQB call.
>From 551f516fc24918c63f7e24a6bd25e4ce268f1331 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 1 Apr 2026 16:29:20 +0100
Subject: [PATCH] [X86] LowerRotate - expand vXi8 non-uniform variable rotates
using uniform constant rotates
We expand vXi8 non-uniform variable rotates as a sequence of uniform constant rotates, each guarded by a SELECT on whether the original rotate amount requires that step.
This patch removes the premature expansion of those uniform constant rotates into OR(SHL,SRL) sequences, allowing GFNI targets to lower each rotate to a single VGF2P8AFFINEQB call.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 29 +-
llvm/test/CodeGen/X86/gfni-rotates.ll | 894 ++++++-------------
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 24 +-
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 60 +-
4 files changed, 337 insertions(+), 670 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e1a7876e30de0..d1de545f86b6a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32177,14 +32177,16 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
- // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
- if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
+ // ISD::ROTR is currently only profitable on GFNI/AVX512+VPTERNLOG targets.
+ if (!IsROTL && !useVPTERNLOG(Subtarget, VT) && !Subtarget.hasGFNI()) {
Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
IsROTL = true;
}
- unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
- unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
+ auto BuildRotate = [&](SDValue R, unsigned RotAmt) {
+ return DAG.getNode(IsROTL ? ISD::ROTL : ISD::ROTR, DL, VT, R,
+ DAG.getConstant(RotAmt, DL, VT));
+ };
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
@@ -32194,32 +32196,19 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
- SDValue M;
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
- R = SignBitSelect(VT, Amt, M, R);
+ R = SignBitSelect(VT, Amt, BuildRotate(R, 4), R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
- R = SignBitSelect(VT, Amt, M, R);
+ R = SignBitSelect(VT, Amt, BuildRotate(R, 2), R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
- return SignBitSelect(VT, Amt, M, R);
+ return SignBitSelect(VT, Amt, BuildRotate(R, 1), R);
}
bool IsSplatAmt = DAG.isSplatValue(Amt);
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 967f26f70946a..87b88b7d2ba1b 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -12,49 +12,34 @@
define <16 x i8> @var_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotl_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: psllw $5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm0, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: psllw $5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm1, %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: var_rotl_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
@@ -93,48 +78,33 @@ define <16 x i8> @var_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotr_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: psllw $5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm0, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm1, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm3
-; GFNISSE-NEXT: por %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: var_rotr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; GFNIAVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; GFNIAVX1OR2-NEXT: vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
@@ -415,55 +385,36 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm2, %xmm4
; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm6 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm7
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7
-; GFNISSE-NEXT: por %xmm0, %xmm7
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6
; GFNISSE-NEXT: psllw $5, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm9
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9
-; GFNISSE-NEXT: por %xmm0, %xmm9
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm7
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm10
-; GFNISSE-NEXT: paddb %xmm2, %xmm10
-; GFNISSE-NEXT: por %xmm0, %xmm10
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm4
; GFNISSE-NEXT: psllw $5, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm4
; GFNISSE-NEXT: paddb %xmm3, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: paddb %xmm1, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm4
; GFNISSE-NEXT: paddb %xmm3, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
@@ -473,45 +424,29 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNIAVX1-LABEL: var_rotl_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX1-NEXT: # xmm3 = mem[0,0]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; GFNIAVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vpor %xmm7, %xmm9, %xmm7
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT: # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; GFNIAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -519,38 +454,26 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
;
; GFNIAVX2-LABEL: var_rotl_v32i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: var_rotl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: retq
@@ -575,108 +498,70 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotr_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm0, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm7 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm8
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8
-; GFNISSE-NEXT: por %xmm0, %xmm8
-; GFNISSE-NEXT: pxor %xmm4, %xmm4
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm2, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm9
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm10
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm10
-; GFNISSE-NEXT: por %xmm9, %xmm10
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm10
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10
-; GFNISSE-NEXT: movdqa %xmm5, %xmm11
-; GFNISSE-NEXT: paddb %xmm5, %xmm11
-; GFNISSE-NEXT: por %xmm10, %xmm11
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm5
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm6
-; GFNISSE-NEXT: por %xmm0, %xmm6
-; GFNISSE-NEXT: psubb %xmm3, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6
; GFNISSE-NEXT: psllw $5, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm2
-; GFNISSE-NEXT: por %xmm0, %xmm2
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm7
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm1, %xmm2
-; GFNISSE-NEXT: por %xmm0, %xmm2
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm4
+; GFNISSE-NEXT: psllw $5, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm4
+; GFNISSE-NEXT: paddb %xmm3, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm4
+; GFNISSE-NEXT: paddb %xmm3, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: var_rotr_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX1-NEXT: # xmm3 = mem[0,0]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; GFNIAVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT: vpsubb %xmm6, %xmm7, %xmm6
-; GFNIAVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT: # xmm9 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpor %xmm8, %xmm10, %xmm8
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
-; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
; GFNIAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -684,40 +569,26 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
;
; GFNIAVX2-LABEL: var_rotr_v32i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
-; GFNIAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm3, %ymm1
; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: var_rotr_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: retq
@@ -1115,103 +986,66 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm4, %xmm8
; GFNISSE-NEXT: movdqa %xmm0, %xmm4
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm10 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm4, %xmm11
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11
-; GFNISSE-NEXT: por %xmm0, %xmm11
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm10
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10
; GFNISSE-NEXT: psllw $5, %xmm8
; GFNISSE-NEXT: movdqa %xmm8, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm4
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm4, %xmm13
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13
-; GFNISSE-NEXT: por %xmm0, %xmm13
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm11
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11
; GFNISSE-NEXT: paddb %xmm8, %xmm8
; GFNISSE-NEXT: movdqa %xmm8, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0
-; GFNISSE-NEXT: movdqa %xmm4, %xmm14
-; GFNISSE-NEXT: paddb %xmm4, %xmm14
-; GFNISSE-NEXT: por %xmm0, %xmm14
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm12
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12
; GFNISSE-NEXT: paddb %xmm8, %xmm8
; GFNISSE-NEXT: movdqa %xmm8, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm4
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
; GFNISSE-NEXT: movdqa %xmm1, %xmm8
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm8
-; GFNISSE-NEXT: por %xmm0, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm8
; GFNISSE-NEXT: psllw $5, %xmm5
; GFNISSE-NEXT: movdqa %xmm5, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm8
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm8
-; GFNISSE-NEXT: por %xmm0, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm8
; GFNISSE-NEXT: paddb %xmm5, %xmm5
; GFNISSE-NEXT: movdqa %xmm5, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm8
-; GFNISSE-NEXT: paddb %xmm1, %xmm8
-; GFNISSE-NEXT: por %xmm0, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8
; GFNISSE-NEXT: paddb %xmm5, %xmm5
; GFNISSE-NEXT: movdqa %xmm5, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm5
; GFNISSE-NEXT: psllw $5, %xmm6
; GFNISSE-NEXT: movdqa %xmm6, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm0
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
; GFNISSE-NEXT: paddb %xmm6, %xmm6
; GFNISSE-NEXT: movdqa %xmm6, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: paddb %xmm2, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5
; GFNISSE-NEXT: paddb %xmm6, %xmm6
; GFNISSE-NEXT: movdqa %xmm6, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
; GFNISSE-NEXT: movdqa %xmm3, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm5
; GFNISSE-NEXT: psllw $5, %xmm7
; GFNISSE-NEXT: movdqa %xmm7, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm0
; GFNISSE-NEXT: movdqa %xmm3, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
; GFNISSE-NEXT: paddb %xmm7, %xmm7
; GFNISSE-NEXT: movdqa %xmm7, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0
; GFNISSE-NEXT: movdqa %xmm3, %xmm5
-; GFNISSE-NEXT: paddb %xmm3, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5
; GFNISSE-NEXT: paddb %xmm7, %xmm7
; GFNISSE-NEXT: movdqa %xmm7, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
@@ -1220,79 +1054,51 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
;
; GFNIAVX1-LABEL: var_rotl_v64i8:
; GFNIAVX1: # %bb.0:
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm6, %xmm7
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
-; GFNIAVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
-; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm6, %xmm9
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm9, %xmm10
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT: # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm9, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm11
-; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm9, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm10
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm10
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm0, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm10
-; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm9, %xmm11, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm9, %xmm11, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
; GFNIAVX1-NEXT: vpsllw $5, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1300,39 +1106,25 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
;
; GFNIAVX2-LABEL: var_rotl_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
-; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9
-; GFNIAVX2-NEXT: vpor %ymm7, %ymm9, %ymm7
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm9
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm10
-; GFNIAVX2-NEXT: vpor %ymm9, %ymm10, %ymm9
+; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
; GFNIAVX2-NEXT: vpsllw $5, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT: retq
@@ -1340,40 +1132,26 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
; GFNIAVX512VL-LABEL: var_rotl_v64i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
-; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7
-; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm10
-; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm10, %ymm9
-; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3
; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -1399,189 +1177,121 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotr_v64i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm0, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm9, %xmm12
+; GFNISSE-NEXT: movdqa %xmm4, %xmm8
+; GFNISSE-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm10
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10
+; GFNISSE-NEXT: psllw $5, %xmm8
+; GFNISSE-NEXT: movdqa %xmm8, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm11
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11
+; GFNISSE-NEXT: paddb %xmm8, %xmm8
+; GFNISSE-NEXT: movdqa %xmm8, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm12
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12
-; GFNISSE-NEXT: por %xmm0, %xmm12
-; GFNISSE-NEXT: pxor %xmm8, %xmm8
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm4, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm9, %xmm13
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm13
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm9, %xmm14
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm14
-; GFNISSE-NEXT: por %xmm13, %xmm14
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm9, %xmm14
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm14
-; GFNISSE-NEXT: movdqa %xmm9, %xmm15
-; GFNISSE-NEXT: paddb %xmm9, %xmm15
-; GFNISSE-NEXT: por %xmm14, %xmm15
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm9
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm14
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm14
-; GFNISSE-NEXT: por %xmm0, %xmm14
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm5, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5
-; GFNISSE-NEXT: movdqa %xmm1, %xmm14
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm14
-; GFNISSE-NEXT: por %xmm5, %xmm14
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm5
-; GFNISSE-NEXT: movdqa %xmm1, %xmm14
-; GFNISSE-NEXT: paddb %xmm1, %xmm14
-; GFNISSE-NEXT: por %xmm5, %xmm14
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT: paddb %xmm8, %xmm8
+; GFNISSE-NEXT: movdqa %xmm8, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm8
+; GFNISSE-NEXT: psllw $5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm8
+; GFNISSE-NEXT: paddb %xmm5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8
+; GFNISSE-NEXT: paddb %xmm5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm6, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm5
+; GFNISSE-NEXT: psllw $5, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5
-; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm6
-; GFNISSE-NEXT: por %xmm5, %xmm6
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT: paddb %xmm6, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm5
-; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: paddb %xmm2, %xmm6
-; GFNISSE-NEXT: por %xmm5, %xmm6
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5
+; GFNISSE-NEXT: paddb %xmm6, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm5
+; GFNISSE-NEXT: psllw $5, %xmm7
+; GFNISSE-NEXT: movdqa %xmm7, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT: paddb %xmm7, %xmm7
+; GFNISSE-NEXT: movdqa %xmm7, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm5
; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5
-; GFNISSE-NEXT: por %xmm0, %xmm5
-; GFNISSE-NEXT: psubb %xmm7, %xmm8
-; GFNISSE-NEXT: psllw $5, %xmm8
-; GFNISSE-NEXT: movdqa %xmm8, %xmm0
+; GFNISSE-NEXT: paddb %xmm7, %xmm7
+; GFNISSE-NEXT: movdqa %xmm7, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa %xmm3, %xmm4
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
-; GFNISSE-NEXT: paddb %xmm8, %xmm8
-; GFNISSE-NEXT: movdqa %xmm8, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0
-; GFNISSE-NEXT: movdqa %xmm3, %xmm4
-; GFNISSE-NEXT: paddb %xmm3, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
-; GFNISSE-NEXT: paddb %xmm8, %xmm8
-; GFNISSE-NEXT: movdqa %xmm8, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: var_rotr_v64i8:
; GFNIAVX1: # %bb.0:
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm7, %xmm6
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm7, %xmm8
-; GFNIAVX1-NEXT: vpor %xmm6, %xmm8, %xmm8
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpsubb %xmm9, %xmm6, %xmm9
-; GFNIAVX1-NEXT: vpsllw $5, %xmm9, %xmm9
-; GFNIAVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm7, %xmm10
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX1-NEXT: # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm10, %xmm11
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm10, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11
-; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm12
-; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT: # xmm9 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm10, %xmm11
-; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm13
-; GFNIAVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
-; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
-; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11
-; GFNIAVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm0, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm0, %xmm11
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8
; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm11
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
-; GFNIAVX1-NEXT: vpsubb %xmm11, %xmm6, %xmm11
-; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm12
-; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm6, %xmm3
; GFNIAVX1-NEXT: vpsllw $5, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1589,42 +1299,25 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
;
; GFNIAVX2-LABEL: var_rotr_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
-; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
-; GFNIAVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; GFNIAVX2-NEXT: vpsubb %ymm2, %ymm7, %ymm2
; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10
-; GFNIAVX2-NEXT: vpor %ymm8, %ymm10, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm0
-; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm11
-; GFNIAVX2-NEXT: vpor %ymm10, %ymm11, %ymm10
+; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
-; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm7, %ymm3
; GFNIAVX2-NEXT: vpsllw $5, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm1, %ymm2
-; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT: retq
@@ -1632,41 +1325,26 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
; GFNIAVX512VL-LABEL: var_rotr_v64i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
-; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7
-; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
-; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm11, %ymm9
-; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3
; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index c0b0446433bd8..a59c8c8c2dbc5 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -467,18 +467,18 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
@@ -486,18 +486,18 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index c0dbbf0571c51..56d32542d9e0a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -129,37 +129,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
+; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm6
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm6
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm6
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm6
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152]
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm8 & (zmm6 ^ zmm4))
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm4
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm8 & (zmm4 ^ zmm3))
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
@@ -169,37 +169,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
+; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm6
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
+; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm6
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm6
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
+; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm6
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152]
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm8 & (ymm6 ^ ymm4))
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm4
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm8 & (ymm4 ^ ymm3))
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
More information about the llvm-commits
mailing list