[llvm] [X86] LowerRotate - expand vXi8 non-uniform variable rotates using uniform constant rotates (PR #189986)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 08:58:12 PDT 2026


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/189986

We expand vXi8 non-uniform variable rotates as a sequence of uniform constant rotates (by 4, 2 and 1), each wrapped in a SELECT on the corresponding bit of the original rotate amount so it is applied only where needed.

This patch stops prematurely expanding those uniform constant rotates to OR(SHL,SRL) sequences, allowing GFNI targets to lower each of them to a single VGF2P8AFFINEQB instruction.
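For reference, here is a minimal scalar model of the select-chain expansion (a sketch for illustration only; the helper name is hypothetical and not part of the patch). Each conditional corresponds to one VSELECT round in the vector lowering, where the amount is pre-shifted so the tested bit sits in the lane sign bit:

    #include <stdint.h>

    // Rotate-left one 8-bit lane by a variable amount using only
    // uniform constant rotates of 4, 2 and 1. In the vector lowering
    // each 'if' becomes a VSELECT keyed on one bit of the amount.
    static uint8_t RotateLeftModel(uint8_t R, uint8_t Amt) {
      if (Amt & 4) R = (uint8_t)((R << 4) | (R >> 4)); // rot(R, 4)
      if (Amt & 2) R = (uint8_t)((R << 2) | (R >> 6)); // rot(R, 2)
      if (Amt & 1) R = (uint8_t)((R << 1) | (R >> 7)); // rot(R, 1)
      return R;
    }

With GFNI, each constant rotate in the vector form is a single GF2P8AFFINEQB, so keeping the steps as ISD::ROTL/ROTR nodes instead of OR(SHL,SRL) saves two instructions per round, as the test diffs below show.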

From 551f516fc24918c63f7e24a6bd25e4ce268f1331 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 1 Apr 2026 16:29:20 +0100
Subject: [PATCH] [X86] LowerRotate - expand vXi8 non-uniform variable rotates
 using uniform constant rotates

We expand vXi8 non-uniform variable rotates as a sequence of uniform constant rotates (by 4, 2 and 1), each wrapped in a SELECT on the corresponding bit of the original rotate amount so it is applied only where needed.

This patch stops prematurely expanding those uniform constant rotates to OR(SHL,SRL) sequences, allowing GFNI targets to lower each of them to a single VGF2P8AFFINEQB instruction.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  29 +-
 llvm/test/CodeGen/X86/gfni-rotates.ll        | 894 ++++++-------------
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll |  24 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll |  60 +-
 4 files changed, 337 insertions(+), 670 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e1a7876e30de0..d1de545f86b6a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32177,14 +32177,16 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getSelect(DL, SelVT, C, V0, V1);
     };
 
-    // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
-    if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
+    // ISD::ROTR is currently only profitable on GFNI/AVX512+VPTERNLOG targets.
+    if (!IsROTL && !useVPTERNLOG(Subtarget, VT) && !Subtarget.hasGFNI()) {
       Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
       IsROTL = true;
     }
 
-    unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
-    unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
+    auto BuildRotate = [&](SDValue R, unsigned RotAmt) {
+      return DAG.getNode(IsROTL ? ISD::ROTL : ISD::ROTR, DL, VT, R,
+                         DAG.getConstant(RotAmt, DL, VT));
+    };
 
     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
     // We can safely do this using i16 shifts as we're only interested in
@@ -32194,32 +32196,19 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     Amt = DAG.getBitcast(VT, Amt);
 
     // r = VSELECT(r, rot(r, 4), a);
-    SDValue M;
-    M = DAG.getNode(
-        ISD::OR, DL, VT,
-        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
-        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
-    R = SignBitSelect(VT, Amt, M, R);
+    R = SignBitSelect(VT, Amt, BuildRotate(R, 4), R);
 
     // a += a
     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
 
     // r = VSELECT(r, rot(r, 2), a);
-    M = DAG.getNode(
-        ISD::OR, DL, VT,
-        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
-        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
-    R = SignBitSelect(VT, Amt, M, R);
+    R = SignBitSelect(VT, Amt, BuildRotate(R, 2), R);
 
     // a += a
     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
 
     // return VSELECT(r, rot(r, 1), a);
-    M = DAG.getNode(
-        ISD::OR, DL, VT,
-        DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
-        DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
-    return SignBitSelect(VT, Amt, M, R);
+    return SignBitSelect(VT, Amt, BuildRotate(R, 1), R);
   }
 
   bool IsSplatAmt = DAG.isSplatValue(Amt);
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 967f26f70946a..87b88b7d2ba1b 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -12,49 +12,34 @@
 define <16 x i8> @var_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotl_v16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
+; GFNISSE-NEXT:    psllw $5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT:    por %xmm0, %xmm3
-; GFNISSE-NEXT:    psllw $5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT:    por %xmm0, %xmm3
-; GFNISSE-NEXT:    paddb %xmm2, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
-; GFNISSE-NEXT:    paddb %xmm1, %xmm3
-; GFNISSE-NEXT:    por %xmm0, %xmm3
-; GFNISSE-NEXT:    paddb %xmm2, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: var_rotl_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
@@ -93,48 +78,33 @@ define <16 x i8> @var_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v16i8:
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    psllw $5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT:    por %xmm0, %xmm3
-; GFNISSE-NEXT:    pxor %xmm0, %xmm0
-; GFNISSE-NEXT:    psubb %xmm1, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT:    por %xmm1, %xmm3
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    paddb %xmm2, %xmm3
-; GFNISSE-NEXT:    por %xmm1, %xmm3
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1OR2-LABEL: var_rotr_v16i8:
 ; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; GFNIAVX1OR2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpor %xmm2, %xmm3, %xmm2
 ; GFNIAVX1OR2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT:    retq
@@ -415,55 +385,36 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm6 = [16909320,16909320]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
-; GFNISSE-NEXT:    por %xmm0, %xmm7
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm9
-; GFNISSE-NEXT:    por %xmm0, %xmm9
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
-; GFNISSE-NEXT:    paddb %xmm2, %xmm10
-; GFNISSE-NEXT:    por %xmm0, %xmm10
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
-; GFNISSE-NEXT:    por %xmm0, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm4
 ; GFNISSE-NEXT:    psllw $5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm4
-; GFNISSE-NEXT:    por %xmm0, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
-; GFNISSE-NEXT:    paddb %xmm1, %xmm4
-; GFNISSE-NEXT:    por %xmm0, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm4
 ; GFNISSE-NEXT:    paddb %xmm3, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
@@ -473,45 +424,29 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNIAVX1-LABEL: var_rotl_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -519,38 +454,26 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: var_rotl_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    retq
@@ -575,108 +498,70 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm7 = [16909320,16909320]
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm8
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
-; GFNISSE-NEXT:    por %xmm0, %xmm8
-; GFNISSE-NEXT:    pxor %xmm4, %xmm4
-; GFNISSE-NEXT:    pxor %xmm0, %xmm0
-; GFNISSE-NEXT:    psubb %xmm2, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm9
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm10
-; GFNISSE-NEXT:    por %xmm9, %xmm10
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm5
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm10
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm11
-; GFNISSE-NEXT:    paddb %xmm5, %xmm11
-; GFNISSE-NEXT:    por %xmm10, %xmm11
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm6
-; GFNISSE-NEXT:    por %xmm0, %xmm6
-; GFNISSE-NEXT:    psubb %xmm3, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm6
 ; GFNISSE-NEXT:    psllw $5, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm2
-; GFNISSE-NEXT:    por %xmm0, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm7
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    paddb %xmm1, %xmm2
-; GFNISSE-NEXT:    por %xmm0, %xmm2
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm4, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm4
+; GFNISSE-NEXT:    psllw $5, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
+; GFNISSE-NEXT:    paddb %xmm3, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm7, %xmm4
+; GFNISSE-NEXT:    paddb %xmm3, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: var_rotr_v32i8:
 ; GFNIAVX1:       # %bb.0:
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm3 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; GFNIAVX1-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; GFNIAVX1-NEXT:    vpsubb %xmm6, %xmm7, %xmm6
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm8, %xmm10, %xmm8
-; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm8, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpblendvb %xmm6, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; GFNIAVX1-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm3
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
 ; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -684,40 +569,26 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; GFNIAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: var_rotr_v32i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    retq
@@ -1115,103 +986,66 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE:       # %bb.0:
 ; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm10 = [16909320,16909320]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
-; GFNISSE-NEXT:    por %xmm0, %xmm11
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
 ; GFNISSE-NEXT:    psllw $5, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm13
-; GFNISSE-NEXT:    por %xmm0, %xmm13
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm4, %xmm14
-; GFNISSE-NEXT:    paddb %xmm4, %xmm14
-; GFNISSE-NEXT:    por %xmm0, %xmm14
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
 ; GFNISSE-NEXT:    paddb %xmm8, %xmm8
 ; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm4
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
-; GFNISSE-NEXT:    por %xmm0, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm8
 ; GFNISSE-NEXT:    psllw $5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm8
-; GFNISSE-NEXT:    por %xmm0, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
-; GFNISSE-NEXT:    paddb %xmm1, %xmm8
-; GFNISSE-NEXT:    por %xmm0, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
 ; GFNISSE-NEXT:    paddb %xmm5, %xmm5
 ; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    paddb %xmm2, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm6, %xmm6
 ; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
 ; GFNISSE-NEXT:    psllw $5, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
-; GFNISSE-NEXT:    paddb %xmm3, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
 ; GFNISSE-NEXT:    paddb %xmm7, %xmm7
 ; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
@@ -1220,79 +1054,51 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX1-LABEL: var_rotl_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm6, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
-; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
 ; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm9, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm11
-; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm9, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm10, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm9, %xmm11, %xmm9
-; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vpblendvb %xmm10, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1300,39 +1106,25 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotl_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
-; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9
-; GFNIAVX2-NEXT:    vpor %ymm7, %ymm9, %ymm7
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm9
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm10
-; GFNIAVX2-NEXT:    vpor %ymm9, %ymm10, %ymm9
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm9, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -1340,40 +1132,26 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotl_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm10
-; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm10, %ymm9
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -1399,189 +1177,121 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNISSE-LABEL: var_rotr_v64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm0, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
-; GFNISSE-NEXT:    pmovsxdq {{.*#+}} xmm11 = [16909320,16909320]
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm12
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm10
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm10
+; GFNISSE-NEXT:    psllw $5, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm11
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm11
+; GFNISSE-NEXT:    paddb %xmm8, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm4
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm12
-; GFNISSE-NEXT:    por %xmm0, %xmm12
-; GFNISSE-NEXT:    pxor %xmm8, %xmm8
-; GFNISSE-NEXT:    pxor %xmm0, %xmm0
-; GFNISSE-NEXT:    psubb %xmm4, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm13
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm13
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm14
-; GFNISSE-NEXT:    por %xmm13, %xmm14
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm9
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm14
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm14
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm15
-; GFNISSE-NEXT:    paddb %xmm9, %xmm15
-; GFNISSE-NEXT:    por %xmm14, %xmm15
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm15, %xmm9
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm14
-; GFNISSE-NEXT:    por %xmm0, %xmm14
-; GFNISSE-NEXT:    pxor %xmm0, %xmm0
-; GFNISSE-NEXT:    psubb %xmm5, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm14
-; GFNISSE-NEXT:    por %xmm5, %xmm14
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm14
-; GFNISSE-NEXT:    paddb %xmm1, %xmm14
-; GFNISSE-NEXT:    por %xmm5, %xmm14
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT:    paddb %xmm8, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm8
+; GFNISSE-NEXT:    psllw $5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm8
+; GFNISSE-NEXT:    paddb %xmm5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm8
+; GFNISSE-NEXT:    paddb %xmm5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
-; GFNISSE-NEXT:    pxor %xmm0, %xmm0
-; GFNISSE-NEXT:    psubb %xmm6, %xmm0
-; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
+; GFNISSE-NEXT:    psllw $5, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm6
-; GFNISSE-NEXT:    por %xmm5, %xmm6
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT:    paddb %xmm6, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
-; GFNISSE-NEXT:    paddb %xmm2, %xmm6
-; GFNISSE-NEXT:    por %xmm5, %xmm6
-; GFNISSE-NEXT:    paddb %xmm0, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm0
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
+; GFNISSE-NEXT:    paddb %xmm6, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm9, %xmm5
+; GFNISSE-NEXT:    psllw $5, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm10, %xmm5
+; GFNISSE-NEXT:    paddb %xmm7, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
 ; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm11, %xmm5
-; GFNISSE-NEXT:    por %xmm0, %xmm5
-; GFNISSE-NEXT:    psubb %xmm7, %xmm8
-; GFNISSE-NEXT:    psllw $5, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
+; GFNISSE-NEXT:    paddb %xmm7, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
 ; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm3
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm12, %xmm4
-; GFNISSE-NEXT:    por %xmm0, %xmm4
-; GFNISSE-NEXT:    paddb %xmm8, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm13, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    paddb %xmm3, %xmm4
-; GFNISSE-NEXT:    por %xmm0, %xmm4
-; GFNISSE-NEXT:    paddb %xmm8, %xmm8
-; GFNISSE-NEXT:    movdqa %xmm8, %xmm0
-; GFNISSE-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
-; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: var_rotr_v64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm7, %xmm6
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
 ; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm7, %xmm8
-; GFNIAVX1-NEXT:    vpor %xmm6, %xmm8, %xmm8
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpsubb %xmm9, %xmm6, %xmm9
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm9, %xmm9
-; GFNIAVX1-NEXT:    vpblendvb %xmm9, %xmm8, %xmm7, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX1-NEXT:    # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm10, %xmm11
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT:    # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm10, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
-; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm12
-; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT:    # xmm9 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm10, %xmm11
-; GFNIAVX1-NEXT:    vpaddb %xmm10, %xmm10, %xmm13
-; GFNIAVX1-NEXT:    vpor %xmm11, %xmm13, %xmm11
-; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
-; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm11, %xmm10, %xmm10
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
-; GFNIAVX1-NEXT:    vpsubb %xmm2, %xmm6, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm8, %xmm6, %xmm8
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm9, %xmm8, %xmm7
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm0, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm0, %xmm11
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm11, %xmm12, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8
 ; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm11, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm11
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
-; GFNIAVX1-NEXT:    vpsubb %xmm11, %xmm6, %xmm11
-; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm2, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm12
-; GFNIAVX1-NEXT:    vpor %xmm10, %xmm12, %xmm10
-; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
-; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm10, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm7, %xmm2, %xmm2
 ; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
-; GFNIAVX1-NEXT:    vpsubb %xmm3, %xmm6, %xmm3
 ; GFNIAVX1-NEXT:    vpsllw $5, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm7, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm8, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm9, %xmm1, %xmm4
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4
 ; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
 ; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
 ; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1589,42 +1299,25 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ;
 ; GFNIAVX2-LABEL: var_rotr_v64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
-; GFNIAVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
-; GFNIAVX2-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; GFNIAVX2-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
 ; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10
-; GFNIAVX2-NEXT:    vpor %ymm8, %ymm10, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm8, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm11
-; GFNIAVX2-NEXT:    vpor %ymm10, %ymm11, %ymm10
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7
 ; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm10, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
-; GFNIAVX2-NEXT:    vpsubb %ymm3, %ymm7, %ymm3
 ; GFNIAVX2-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2
 ; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
@@ -1632,41 +1325,26 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind {
 ; GFNIAVX512VL-LABEL: var_rotr_v64i8:
 ; GFNIAVX512VL:       # %bb.0:
 ; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6
-; GFNIAVX512VL-NEXT:    vpor %ymm4, %ymm6, %ymm4
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT:    vpor %ymm7, %ymm9, %ymm7
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm7, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11
-; GFNIAVX512VL-NEXT:    vpor %ymm9, %ymm11, %ymm9
-; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm9, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm10, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpor %ymm3, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3
 ; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
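
A note on the new GFNI constants above: each broadcast vector constant encodes an 8x8 GF(2) bit matrix for vgf2p8affineqb, and the constants for the fixed rotates appear to be the identity matrix bytes [128,64,32,16,8,4,2,1] with the rows rotated by the rotate amount, e.g. [8,4,2,1,128,64,32,16] for the rotate-by-4 step and [2,1,128,64,32,16,8,4] for rotate-by-2. As a hypothetical sketch (gfniRotrMatrix is a made-up name, not the in-tree helper), the per-amount matrix can be derived as:

  #include <array>
  #include <cstdint>

  // Rows of a GFNI control matrix for a fixed ROTR of each byte lane,
  // assuming the pattern visible in the checks above: the identity rows
  // (single-bit bytes 128,64,...,1) rotated by Amt byte positions.
  std::array<uint8_t, 8> gfniRotrMatrix(unsigned Amt) {
    std::array<uint8_t, 8> M;
    for (unsigned I = 0; I != 8; ++I)
      M[I] = uint8_t(0x80u >> ((I + 8 - Amt) % 8)); // Amt in [0,8)
    return M;
  }

This is why a single vgf2p8affineqb per step now replaces each OR(SHL,SRL) pair in the checks above.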
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index c0b0446433bd8..a59c8c8c2dbc5 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -467,18 +467,18 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm3
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
@@ -486,18 +486,18 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm3
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index c0dbbf0571c51..56d32542d9e0a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -129,37 +129,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v64i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
-; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm4
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
+; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $6, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm6
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
+; AVX512F-NEXT:    vpsrlw $2, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsllw $6, %ymm2, %ymm6
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728]
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $7, %ymm2, %ymm4
-; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm6
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm4
+; AVX512F-NEXT:    vpsllw $7, %ymm2, %ymm6
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm8 = [2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152]
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm8 & (zmm6 ^ zmm4))
 ; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm4
+; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm4
+; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm3
+; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm4
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm8 & (zmm4 ^ zmm3))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
@@ -169,37 +169,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ; AVX512VL-LABEL: var_funnnel_v64i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
-; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135]
+; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm3
+; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
 ; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $6, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm6
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567]
+; AVX512VL-NEXT:    vpsrlw $2, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsllw $6, %ymm2, %ymm6
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728,3233857728]
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm4
-; AVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm6
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143]
+; AVX512VL-NEXT:    vpsrlw $1, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm6
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152,2155905152]
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm8 & (ymm6 ^ ymm4))
 ; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
 ; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
+; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm3
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm4
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm8 & (ymm4 ^ ymm3))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
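
For reference, a minimal scalar model of the expansion these checks exercise (an illustrative sketch of one byte lane, not the DAG lowering itself; rot_const and var_rotate_lane are made-up names): the rotate amount is shifted so its highest useful bit lands in the sign bit (psllw $5), each amount bit then selects a uniform constant rotate via pblendvb, and paddb advances to the next bit.

  #include <cstdint>

  // Models the single constant-amount rotate per step: one
  // vgf2p8affineqb on GFNI targets, or shift+vpternlog on AVX512.
  static uint8_t rot_const(uint8_t R, unsigned N, bool IsROTL) {
    return IsROTL ? uint8_t((R << N) | (R >> (8 - N)))
                  : uint8_t((R >> N) | (R << (8 - N)));
  }

  uint8_t var_rotate_lane(uint8_t R, uint8_t Amt, bool IsROTL) {
    uint8_t A = uint8_t(Amt << 5);             // psllw $5: bit 2 -> sign bit
    if (A & 0x80) R = rot_const(R, 4, IsROTL); // pblendvb keys off sign bit
    A = uint8_t(A + A);                        // paddb: expose the next bit
    if (A & 0x80) R = rot_const(R, 2, IsROTL);
    A = uint8_t(A + A);
    if (A & 0x80) R = rot_const(R, 1, IsROTL);
    return R;                                  // rotated by Amt mod 8
  }

Because GFNI targets can now stay on ISD::ROTR, the vpsubb negation of the amount disappears from the rotr diffs above, and only the vpsllw $5 / vpaddb / vpblendvb select chain remains.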


