[llvm] [AMDGPU] Vectorize more 16 bit shuffles (PR #90648)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 08:48:50 PDT 2024
================
@@ -1129,31 +1129,55 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ if (!isa<FixedVectorType>(VT))
+ return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
- // Treat extractsubvector as single op permutation.
- bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
- if (IsExtractSubvector)
- Kind = TTI::SK_PermuteSingleSrc;
-
- if (ST->hasVOP3PInsts()) {
- if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
- DL.getTypeSizeInBits(VT->getElementType()) == 16) {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle is free.
- switch (Kind) {
- case TTI::SK_Broadcast:
- case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc:
+ // Larger vector widths may require additional instructions, but are
+ // typically cheaper than scalarized versions.
+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+ if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ bool HasVOP3P = ST->hasVOP3PInsts();
+ unsigned RequestedElts =
+ count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+ if (RequestedElts == 0)
+ return 0;
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc: {
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (HasVOP3P && NumVectorElts == 2)
return 0;
- default:
- break;
- }
+ unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+ // SK_Broadcast just reuses the same mask
+ unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
+ return NumPerms + NumPermMasks;
+ }
+ case TTI::SK_ExtractSubvector:
+ case TTI::SK_InsertSubvector: {
+ if (HasVOP3P && NumVectorElts == 2)
+ return 0;
+ // Insert/extract subvectors require only shifts / extract code to get the
+ // relevant bits
+ return alignTo(RequestedElts, 2) / 2;
----------------
arsenm wrote:
There's only a shift if you need to access the high half of a register.
https://github.com/llvm/llvm-project/pull/90648
More information about the llvm-commits mailing list