[llvm] [X86] LowerRotate - expand vXi8 non-uniform variable rotates using uniform constant rotates (PR #189986)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 08:58:45 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
We expand vXi8 non-uniform variable rotates as a sequence of uniform constant rotates, together with a SELECT that applies each step only when the original rotate amount requires it.
This patch removes the premature expansion of the uniform constant rotates into OR(SHL,SRL) sequences, allowing GFNI targets to lower each step to a single VGF2P8AFFINEQB call.
---
Patch is 76.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189986.diff
4 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+9-20)
- (modified) llvm/test/CodeGen/X86/gfni-rotates.ll (+286-608)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-256.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-512.ll (+30-30)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e1a7876e30de0..d1de545f86b6a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32177,14 +32177,16 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
- // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
- if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
+ // ISD::ROTR is currently only profitable on GFNI/AVX512+VPTERNLOG targets.
+ if (!IsROTL && !useVPTERNLOG(Subtarget, VT) && !Subtarget.hasGFNI()) {
Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
IsROTL = true;
}
- unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
- unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
+ auto BuildRotate = [&](SDValue R, unsigned RotAmt) {
+ return DAG.getNode(IsROTL ? ISD::ROTL : ISD::ROTR, DL, VT, R,
+ DAG.getConstant(RotAmt, DL, VT));
+ };
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
@@ -32194,32 +32196,19 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
- SDValue M;
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
- R = SignBitSelect(VT, Amt, M, R);
+ R = SignBitSelect(VT, Amt, BuildRotate(R, 4), R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
- R = SignBitSelect(VT, Amt, M, R);
+ R = SignBitSelect(VT, Amt, BuildRotate(R, 2), R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
- M = DAG.getNode(
- ISD::OR, DL, VT,
- DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
- DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
- return SignBitSelect(VT, Amt, M, R);
+ return SignBitSelect(VT, Amt, BuildRotate(R, 1), R);
}
bool IsSplatAmt = DAG.isSplatValue(Amt);
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 967f26f70946a..87b88b7d2ba1b 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -12,49 +12,34 @@
define <16 x i8> @var_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotl_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: psllw $5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm0, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: psllw $5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm1, %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: var_rotl_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
@@ -93,48 +78,33 @@ define <16 x i8> @var_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotr_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: psllw $5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm0, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm0, %xmm3
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm1, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm3
; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: por %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm3
-; GFNISSE-NEXT: por %xmm1, %xmm3
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; GFNISSE-NEXT: movdqa %xmm2, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: var_rotr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; GFNIAVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; GFNIAVX1OR2-NEXT: vpsllw $5, %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT: vpor %xmm2, %xmm3, %xmm2
; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
@@ -415,55 +385,36 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm2, %xmm4
; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm6 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm7
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7
-; GFNISSE-NEXT: por %xmm0, %xmm7
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6
; GFNISSE-NEXT: psllw $5, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm9
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9
-; GFNISSE-NEXT: por %xmm0, %xmm9
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm7
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
-; GFNISSE-NEXT: movdqa %xmm2, %xmm10
-; GFNISSE-NEXT: paddb %xmm2, %xmm10
-; GFNISSE-NEXT: por %xmm0, %xmm10
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8
; GFNISSE-NEXT: paddb %xmm4, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm4
; GFNISSE-NEXT: psllw $5, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm4
; GFNISSE-NEXT: paddb %xmm3, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm4
-; GFNISSE-NEXT: paddb %xmm1, %xmm4
-; GFNISSE-NEXT: por %xmm0, %xmm4
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm4
; GFNISSE-NEXT: paddb %xmm3, %xmm3
; GFNISSE-NEXT: movdqa %xmm3, %xmm0
; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm1
@@ -473,45 +424,29 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNIAVX1-LABEL: var_rotl_v32i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
; GFNIAVX1-NEXT: # xmm3 = mem[0,0]
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0]
-; GFNIAVX1-NEXT: # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6
-; GFNIAVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
-; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; GFNIAVX1-NEXT: vpsllw $5, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64]
; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNIAVX1-NEXT: # xmm8 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vpor %xmm7, %xmm9, %xmm7
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNIAVX1-NEXT: # xmm7 = mem[0,0]
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm10
-; GFNIAVX1-NEXT: vpor %xmm9, %xmm10, %xmm9
-; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm9, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128]
+; GFNIAVX1-NEXT: # xmm6 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm2, %xmm2
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm0, %xmm5
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
; GFNIAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm0, %xmm3
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm3
; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -519,38 +454,26 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
;
; GFNIAVX2-LABEL: var_rotl_v32i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: var_rotl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: retq
@@ -575,108 +498,70 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_rotr_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm0, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm7 = [16909320,16909320]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm8
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8
-; GFNISSE-NEXT: por %xmm0, %xmm8
-; GFNISSE-NEXT: pxor %xmm4, %xmm4
-; GFNISSE-NEXT: pxor %xmm0, %xmm0
-; GFNISSE-NEXT: psubb %xmm2, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm9
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm10
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm10
-; GFNISSE-NEXT: por %xmm9, %xmm10
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm5
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128]
-; GFNISSE-NEXT: movdqa %xmm5, %xmm10
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10
-; GFNISSE-NEXT: movdqa %xmm5, %xmm11
-; GFNISSE-NEXT: paddb %xmm5, %xmm11
-; GFNISSE-NEXT: por %xmm10, %xmm11
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm5
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm6
-; GFNISSE-NEXT: por %xmm0, %xmm6
-; GFNISSE-NEXT: psubb %xmm3, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16]
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6
; GFNISSE-NEXT: psllw $5, %xmm4
; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm0
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189986
More information about the llvm-commits
mailing list