[llvm] 0f652d8 - [X86] LowerRotate - recognise hidden ROTR patterns for better vXi8 codegen
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 19 03:52:32 PST 2021
Author: Simon Pilgrim
Date: 2021-11-19T11:49:15Z
New Revision: 0f652d8f527f3743771c8ad70f47d1019cb7ca1a
URL: https://github.com/llvm/llvm-project/commit/0f652d8f527f3743771c8ad70f47d1019cb7ca1a
DIFF: https://github.com/llvm/llvm-project/commit/0f652d8f527f3743771c8ad70f47d1019cb7ca1a.diff
LOG: [X86] LowerRotate - recognise hidden ROTR patterns for better vXi8 codegen
Check for a hidden ISD::ROTR (rotl(sub(0,x))) - vXi8 lowering can handle both (it's always beneficial for splats, but otherwise only if we have VPTERNLOG).
We currently hit infinite loops in TargetLowering::expandROT if we set ISD::ROTR to custom, which needs addressing before we extend this much further.
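As an aside (not part of the commit), the pattern being matched is the rotate-by-negated-amount identity: a rotl whose amount is sub(0,y) is really a rotr by y, because only the amount modulo the element width matters. A minimal standalone C++ sketch of that identity for 8-bit lanes, using illustrative helper names (rotl8/rotr8 are not LLVM APIs):

#include <cassert>
#include <cstdint>

// Rotate helpers for 8-bit lanes; amounts are reduced modulo 8.
static uint8_t rotl8(uint8_t X, unsigned Amt) {
  Amt &= 7;
  return (uint8_t)((X << Amt) | (X >> ((8 - Amt) & 7)));
}
static uint8_t rotr8(uint8_t X, unsigned Amt) {
  Amt &= 7;
  return (uint8_t)((X >> Amt) | (X << ((8 - Amt) & 7)));
}

int main() {
  // The "hidden ROTR": rotating left by (0 - y) is the same as rotating
  // right by y, since only the amount modulo 8 matters.
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y)
      assert(rotl8((uint8_t)X, (unsigned)(uint8_t)(0 - Y)) ==
             rotr8((uint8_t)X, Y));
  return 0;
}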
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 520464a4056c8..66e7260118b6c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29854,20 +29854,30 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
+ // Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we
+ // currently hit infinite loops in legalization if we allow ISD::ROTR.
+ // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT.
+ SDValue HiddenROTRAmt;
+ if (Amt.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode()))
+ HiddenROTRAmt = Amt.getOperand(1);
+
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// If the amount is a splat, attempt to fold as unpack(x,x) << zext(y):
// rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
- if (SDValue BaseRotAmt =
- DAG.getSplatValue(DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask))) {
+ // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
+ if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode(
+ ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) {
+ unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI;
BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt);
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
- Lo = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Lo, BaseRotAmt,
+ Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
Subtarget, DAG);
- Hi = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Hi, BaseRotAmt,
+ Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
Subtarget, DAG);
- return getPack(DAG, Subtarget, DL, VT, Lo, Hi, /*PackHiHalf */ true);
+ return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt);
}
// We don't need ModuloAmt here as we just peek at individual bits.
@@ -29889,6 +29899,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
+ // 'Hidden' ROTR is currently only profitable on AVX512 targets where we
+ // have VPTERNLOG.
+ unsigned ShiftLHS = ISD::SHL;
+ unsigned ShiftRHS = ISD::SRL;
+ if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) {
+ std::swap(ShiftLHS, ShiftRHS);
+ Amt = HiddenROTRAmt;
+ }
+
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
@@ -29900,8 +29919,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@@ -29910,8 +29929,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
@@ -29920,8 +29939,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
- DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
- DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+ DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
+ DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
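(Aside, not part of the patch.) The non-splat vXi8 path above is the blend-based rotate: the amount is shifted left by 5 so its three useful bits can be consumed one at a time through sign-bit selects, conditionally applying a rotate by 4, then 2, then 1. With the ShiftLHS/ShiftRHS swap the same select chain produces a right rotate for the hidden-ROTR case. A rough scalar model of that flow, where blendRotate8 is an illustrative name rather than an LLVM function:

#include <cassert>
#include <cstdint>

// Scalar model of the blend-based vXi8 rotate in LowerRotate: consume the
// low 3 bits of the amount via sign-bit selects, rotating right when
// RotateRight is set (the hidden-ROTR case), otherwise left.
static uint8_t blendRotate8(uint8_t R, uint8_t Amt, bool RotateRight) {
  uint8_t A = (uint8_t)(Amt << 5);      // a = a << 5: move bit 2 into the sign bit
  const unsigned Steps[] = {4, 2, 1};
  for (unsigned Step : Steps) {
    uint8_t Lo = RotateRight ? (uint8_t)(R >> Step) : (uint8_t)(R << Step);
    uint8_t Hi = RotateRight ? (uint8_t)(R << (8 - Step)) : (uint8_t)(R >> (8 - Step));
    uint8_t M = (uint8_t)(Lo | Hi);     // rot(R, Step) in the chosen direction
    if (A & 0x80)                       // VSELECT keyed on the amount's sign bit
      R = M;
    A = (uint8_t)(A + A);               // a += a: expose the next amount bit
  }
  return R;
}

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 8; ++Y) {
      uint8_t L = (uint8_t)((X << Y) | (X >> ((8 - Y) & 7)));
      uint8_t Rr = (uint8_t)((X >> Y) | (X << ((8 - Y) & 7)));
      assert(blendRotate8((uint8_t)X, (uint8_t)Y, /*RotateRight=*/false) == L);
      assert(blendRotate8((uint8_t)X, (uint8_t)Y, /*RotateRight=*/true) == Rr);
    }
  return 0;
}

On AVX512 the shift/or/mask pattern in each step can be matched into vpternlogq (visible in the var_funnnel_v32i8 test diffs below), which is consistent with the patch's note that the hidden-ROTR form is only treated as profitable when VPTERNLOG is available.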
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index baabaef0fa944..7aff8a3e67357 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1195,47 +1195,44 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psllw %xmm2, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: psubb %xmm1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: psllw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psllw %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_funnnel_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
;
@@ -1349,19 +1346,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm2, %xmm2
-; X86-SSE2-NEXT: psubb %xmm1, %xmm2
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm1
-; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm0
-; X86-SSE2-NEXT: psrlw $8, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index a7338b8121691..4f3e9100d7944 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX1-LABEL: splatvar_funnnel_v32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
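(Aside, not part of the patch.) The splat-amount codegen above follows the widened-shift identity from the comment in LowerRotate: unpacking a byte with itself forms the 16-bit value (x << 8) | x, and a logical right shift of that by y & 7 leaves rotr(x, y) in the low byte, hence the psrlw, the mask with 255 and the packuswb. A tiny standalone check of that identity, for illustration only:

#include <cassert>
#include <cstdint>

int main() {
  // rotr(x, y) == low byte of (((x << 8) | x) >> (y & 7)); this is what the
  // unpack + psrlw + pand(255) + packuswb sequence computes per byte.
  for (unsigned X = 0; X < 256; ++X) {
    for (unsigned Y = 0; Y < 8; ++Y) {
      uint16_t Wide = (uint16_t)((X << 8) | X);
      uint8_t ViaWide = (uint8_t)(Wide >> Y);
      uint8_t Rotr = (uint8_t)((X >> Y) | (X << ((8 - Y) & 7)));
      assert(ViaWide == Rotr);
    }
  }
  return 0;
}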