[llvm-dev] 64 bit mask in x86vshuffle instruction
hameeza ahmed via llvm-dev
llvm-dev at lists.llvm.org
Wed Apr 11 12:24:11 PDT 2018
I am still unable to solve the vshuffle issue. Can you tell me how to split
a v64i32 shuffle into 4 v16i32 shuffles?
On Wed, Apr 11, 2018 at 12:14 AM, hameeza ahmed <hahmed2305 at gmail.com>
wrote:
> Please tell me whether the following implementation is correct.....
> My target supports a 64-bit mask, i.e. an immediate in the range (0-2^63).
>
> I have implemented it, but I don't know whether it is correct or not. Please
> see the changes below that I have made in X86ISelLowering.cpp.
>
> static SDValue lower2048BitVectorShuffle(const SDLoc &DL, ArrayRef<int>
> Mask,
> MVT VT, SDValue V1, SDValue V2,
> const SmallBitVector &Zeroable,
> const X86Subtarget &Subtarget,
> SelectionDAG &DAG) {
>
>
> // If we have a single input to the zero element, insert that into V1 if
> we
> // can do so cheaply.
> int NumElts = Mask.size();
> int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >=
> NumElts; });
>
> if (NumV2Elements == 1 && Mask[0] >= NumElts)
> if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
> DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
> return Insertion;
>
> // Check for being able to broadcast a single element.
> if (SDValue Broadcast =
> lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget,
> DAG))
> return Broadcast;
>
> // Dispatch to each element type for lowering. If we don't have support
> for
> // specific element type shuffles at 2048 bits, immediately split them and
> // lower them. Each lowering routine of a given type is allowed to
> assume that
> // the requisite ISA extensions for that element type are available.
> switch (VT.SimpleTy) {
> case MVT::v32f64:
> return lowerV32F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget,
> DAG);
> case MVT::v64f32:
> return lowerV64F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget,
> DAG);
> case MVT::v32i64:
> return lowerV32I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget,
> DAG);
> case MVT::v64i32:
> return lowerV64I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget,
> DAG);
>
> default:
> llvm_unreachable("Not a valid P x86 vector type!");
> }
> }
>
> static SDValue lowerV64I32VectorShuffle(const SDLoc &DL, ArrayRef<int>
> Mask,
> const SmallBitVector &Zeroable,
> SDValue V1, SDValue V2,
> const X86Subtarget &Subtarget,
> SelectionDAG &DAG) {
>
> assert(V1.getSimpleValueType() == MVT::v64i32 && "Bad operand type!");
> assert(V2.getSimpleValueType() == MVT::v64i32 && "Bad operand type!");
> assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
>
> if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
> DL, MVT::v64i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
> return ZExt;
>
>
> SmallVector<int, 16> RepeatedMask;
>
> bool Is128BitLaneRepeatedShuffle =
> is128BitLaneRepeatedShuffleMask(MVT::v64i32, Mask, RepeatedMask);
>
> if (Is128BitLaneRepeatedShuffle) {
> // assert(RepeatedMask.size() == 16 && "Unexpected repeated mask
> size!");
>
>
> if (V2.isUndef())
> {
> return DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, V1,
> getV16X86ShuffleImm64ForMask(/*Repeated*/Mask,
> DL, DAG));
>
>
> }
>
> // Use dedicated unpack instructions for masks that match their
> pattern.
> if (SDValue V =
> lowerVectorShuffleWithUNPCK(DL, MVT::v64i32, Mask, V1, V2,
> DAG))
> return V;
> }
>
>
>
> // Try to use shift instructions.
> if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i32, V1, V2,
> Mask,
> Zeroable, Subtarget, DAG))
> return Shift;
>
> // Try to use VALIGN.
> if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v64i32, V1, V2,
> Mask, Subtarget, DAG))
> return Rotate;
>
> // Assume that a single SHUFPS is faster than using a permv shuffle.
> // If some CPU is harmed by the domain switch, we can fix it in a later
> pass.
>
> // If we have AVX512F support, we can use VEXPAND.
> if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v64i32, Zeroable,
> Mask,
> V1, V2, DAG, Subtarget))
> return V;
>
> return lowerVectorShuffleWithPERMV(DL, MVT::v64i32, Mask, V1, V2, DAG);
>
> }
>
> static SDValue lowerV32I64VectorShuffle(const SDLoc &DL, ArrayRef<int>
> Mask,
> const SmallBitVector &Zeroable,
> SDValue V1, SDValue V2,
> const X86Subtarget &Subtarget,
> SelectionDAG &DAG) {
> assert(V1.getSimpleValueType() == MVT::v32i64 && "Bad operand type!");
> assert(V2.getSimpleValueType() == MVT::v32i64 && "Bad operand type!");
> assert(Mask.size() == 32 && "Unexpected mask size for v8 shuffle!");
>
> if (SDValue Shuf128 =
> lowerV16X128VectorShuffle(DL, MVT::v32i64, Mask, V1, V2, DAG))
> return Shuf128;
>
> if (V2.isUndef()) {
> // When the shuffle is mirrored between the 128-bit lanes of the unit,
> we
> // can use lower latency instructions that will operate on all four
> // 128-bit lanes.
> SmallVector<int, 8> Repeated128Mask;
> if (is128BitLaneRepeatedShuffleMask(MVT::v32i64, Mask,
> Repeated128Mask)) {
> SmallVector<int, 64> PSHUFDMask;
> scaleShuffleMask(8, Repeated128Mask, PSHUFDMask);
> return DAG.getBitcast(
> MVT::v32i64,
> DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32,
> DAG.getBitcast(MVT::v64i32, V1),
> getV16X86ShuffleImm64ForMask(PSHUFDMask, DL, DAG)));
> }
>
> SmallVector<int, 16> Repeated256Mask;
> if (is256BitLaneRepeatedShuffleMask(MVT::v32i64, Mask,
> Repeated256Mask))
> return DAG.getNode(X86ISD::VPERMI, DL, MVT::v32i64, V1,
> getV16X86ShuffleImm64ForMask(Repeated256Mask,
> DL, DAG));
> }
>
> // Try to use shift instructions.
> if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i64, V1, V2,
> Mask,
> Zeroable, Subtarget, DAG))
> return Shift;
>
> // Try to use VALIGN.
> if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v32i64, V1, V2,
> Mask, Subtarget, DAG))
> return Rotate;
>
> // Try to use PALIGNR.
> if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v32i64,
> V1, V2,
> Mask, Subtarget,
> DAG))
> return Rotate;
>
> if (SDValue Unpck =
> lowerVectorShuffleWithUNPCK(DL, MVT::v32i64, Mask, V1, V2, DAG))
> return Unpck;
> // If we have AVX512F support, we can use VEXPAND.
> if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v32i64, Zeroable,
> Mask, V1,
> V2, DAG, Subtarget))
> return V;
>
> return lowerVectorShuffleWithPERMV(DL, MVT::v32i64, Mask, V1, V2, DAG);
> }
>
> static SDValue getV64X86ShuffleImm64ForMask(ArrayRef<int> Mask, SDLoc DL,
> SelectionDAG &DAG) {
> return DAG.getConstant(getV64X86ShuffleImm(Mask), DL, MVT::i64);
> }
>
>
> static uint64_t getV16X86ShuffleImm(ArrayRef<int> Mask) {
> // assert(Mask.size() == 16 && "Only 16-lane shuffle masks");
> assert(Mask[0] >= -1 && Mask[0] < 16 && "Out of bound mask element!");
> assert(Mask[1] >= -1 && Mask[1] < 16 && "Out of bound mask element!");
> assert(Mask[2] >= -1 && Mask[2] < 16 && "Out of bound mask element!");
> assert(Mask[3] >= -1 && Mask[3] < 16 && "Out of bound mask element!");
> assert(Mask[4] >= -1 && Mask[4] < 16 && "Out of bound mask element!");
> assert(Mask[5] >= -1 && Mask[5] < 16 && "Out of bound mask element!");
> assert(Mask[6] >= -1 && Mask[6] < 16 && "Out of bound mask element!");
> assert(Mask[7] >= -1 && Mask[7] < 16 && "Out of bound mask element!");
> assert(Mask[8] >= -1 && Mask[8] < 16 && "Out of bound mask element!");
> assert(Mask[9] >= -1 && Mask[9] < 16 && "Out of bound mask element!");
> assert(Mask[10] >= -1 && Mask[10] < 16 && "Out of bound mask element!");
> assert(Mask[11] >= -1 && Mask[11] < 16 && "Out of bound mask element!");
> assert(Mask[12] >= -1 && Mask[12] < 16 && "Out of bound mask element!");
> assert(Mask[13] >= -1 && Mask[13] < 16 && "Out of bound mask element!");
> assert(Mask[14] >= -1 && Mask[14] < 16 && "Out of bound mask element!");
> assert(Mask[15] >= -1 && Mask[15] < 16 && "Out of bound mask element!");
>
> uint64_t Imm = 0;
> Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
> Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
> Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
> Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
> Imm |= (Mask[4] < 0 ? 4 : Mask[4]) << 8;
> Imm |= (Mask[5] < 0 ? 5 : Mask[5]) << 10;
> Imm |= (Mask[6] < 0 ? 6 : Mask[6]) << 12;
> Imm |= (Mask[7] < 0 ? 7 : Mask[7]) << 14;
> Imm |= (Mask[8] < 0 ? 8 : Mask[8]) << 16;
> Imm |= (Mask[9] < 0 ? 9 : Mask[9]) << 18;
> Imm |= (Mask[10] < 0 ? 10 : Mask[10]) << 20;
> Imm |= (Mask[11] < 0 ? 11 : Mask[11]) << 22;
> Imm |= (Mask[12] < 0 ? 12 : Mask[12]) << 24;
> Imm |= (Mask[13] < 0 ? 13 : Mask[13]) << 26;
> Imm |= (Mask[14] < 0 ? 14 : Mask[14]) << 28;
> Imm |= (Mask[15] < 0 ? 15 : Mask[15]) << 30;
>
> return Imm;
> }
>
>
> static SDValue lowerV16X128VectorShuffle(const SDLoc &DL, MVT VT,
> ArrayRef<int> Mask, SDValue V1,
> SDValue V2, SelectionDAG &DAG) {
> assert(VT.getScalarSizeInBits() == 64 &&
> "Unexpected element type size for 128bit shuffle.");
>
> // To handle 256 bit vector requires VLX and most probably
> // function lowerV2X128VectorShuffle() is better solution.
> assert(VT.is2048BitVector() && "Unexpected vector size for 2048bit
> shuffle.");
>
> SmallVector<int, 16> WidenedMask;
> if (!canWidenShuffleElements(Mask, WidenedMask))
> return SDValue();
>
> // Check for patterns which can be matched with a single insert of a
> 256-bit
> // subvector.
> bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
> {0, 1, 2, 3, 4, 5, 6, 7,0, 1, 2,
> 3, 4, 5, 6, 7});
> if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
> {0, 1, 2, 3, 4, 5, 6, 7, 12, 13,
> 14, 15,16,17,18,19}))
>
> {
> MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 16);
> SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
> DAG.getIntPtrConstant(0, DL));
> SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
> OnlyUsesV1 ? V1 : V2,
> DAG.getIntPtrConstant(0, DL));
> return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
> }
>
> assert(WidenedMask.size() == 16);
>
> // See if this is an insertion of the lower 128-bits of V2 into V1.
> bool IsInsert = true;
> int V2Index = -1;
> for (int i = 0; i < 16; ++i) {
> assert(WidenedMask[i] >= -1);
> if (WidenedMask[i] < 0)
> continue;
>
> // Make sure all V1 subvectors are in place.
> if (WidenedMask[i] < 16) {
> if (WidenedMask[i] != i) {
> IsInsert = false;
> break;
> }
> } else {
> // Make sure we only have a single V2 index and its the lowest
> 128-bits.
> if (V2Index >= 0 || WidenedMask[i] != 16) {
> IsInsert = false;
> break;
> }
> V2Index = i;
> }
> }
> if (IsInsert && V2Index >= 0) {
> MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
> SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
> DAG.getIntPtrConstant(0, DL));
> return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
> }
>
> // Try to lower to vshuf64x2/vshuf32x4.
> SDValue Ops[8] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT),
> DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT),
> DAG.getUNDEF(VT)};
> unsigned PermMask = 0;
> // Ensure elements came from the same Op.
> for (int i = 0; i < 16; ++i) {
> assert(WidenedMask[i] >= -1);
> if (WidenedMask[i] < 0)
> continue;
>
> SDValue Op = WidenedMask[i] >= 16 ? V2 : V1;
> unsigned OpIndex = i / 2;
> if (Ops[OpIndex].isUndef())
> Ops[OpIndex] = Op;
> else if (Ops[OpIndex] != Op)
> return SDValue();
>
> // Convert the 128-bit shuffle mask selection values into 128-bit
> selection
> // bits defined by a vshuf64x2 instruction's immediate control byte.
> PermMask |= (WidenedMask[i] % 16) << (i * 2);
> }
>
> return DAG.getNode(X86ISD::SHUF128_P64, DL, VT, Ops[0], Ops[1],
> DAG.getConstant(PermMask, DL, MVT::i64));
> }
>
>
>
> Please help... I am really sorry for asking, but I am stuck here. The code
> runs without error; at the output it gives the following assembly:
>
> P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1,
> 236 # encoding: []
> P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 #
> encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
> P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1,
> 244 # encoding: []
> P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 #
> encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
> P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 14
> # encoding: []
> P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 #
> encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
> P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 1
> # encoding: []
> P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 #
> encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
> P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 236 # encoding: []
> P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 #
> encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00]
> P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 229 # encoding: []
>
> Here the immediates are small (0-255), which looks like only 8 bits...
> What should I do? Please help.
>
>
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180412/20d8db1a/attachment-0001.html>
More information about the llvm-dev
mailing list