[llvm] [AArch64] Improve cost model for legal subvec insert/extract (PR #81135)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 01:33:07 PST 2024
================
@@ -568,6 +568,32 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
return Cost;
}
+ case Intrinsic::vector_extract: {
+ // If both the vector argument and the return type are legal types, then
+ // this should be a no-op or simple operation; return a relatively low cost.
+ LLVMContext &C = RetTy->getContext();
+ EVT MRTy = getTLI()->getValueType(DL, RetTy);
+ EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+ TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+ if (RLK.first == TargetLoweringBase::TypeLegal &&
+ PLK.first == TargetLoweringBase::TypeLegal)
+ return InstructionCost(1);
----------------
david-arm wrote:
At first glance this seems too optimistic because there are cases where the cost is higher. See llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll for some examples:
```
define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_0(<vscale x 16 x i1> %in) {
; CHECK-LABEL: extract_nxv4i1_nxv16i1_0:
; CHECK: // %bb.0:
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: ret
%res = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %in, i64 0)
ret <vscale x 4 x i1> %res
}
```
or
```
define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_0(<vscale x 16 x i1> %in) {
; CHECK-LABEL: extract_nxv2i1_nxv16i1_0:
; CHECK: // %bb.0:
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %in, i64 0)
ret <vscale x 2 x i1> %res
}
```
I think these should have a cost higher than 1, since they lower to two and three `punpklo` instructions respectively. Also, what about mixing legal fixed-width and scalable vector types, such as:
```
define void @foo(ptr %a, ptr %b) vscale_range(16,0) {
%op = load <vscale x 8 x i16>, ptr %a
%ret = call <4 x i16> @llvm.vector.extract.v8i16.v16i16(<vscale x 8 x i16> %op, i64 4)
store <4 x i16> %ret, ptr %b
ret void
}
```
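which leads to this assembly output, where the extract is performed by storing the whole vector to the stack and reloading the subvector: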
```
foo: // @foo
str x29, [sp, #-16]! // 8-byte Folded Spill
addvl sp, sp, #-1
ptrue p0.h
ld1h { z0.h }, p0/z, [x0]
st1h { z0.h }, p0, [sp]
ldr d0, [sp, #8]
str d0, [x1]
addvl sp, sp, #1
ldr x29, [sp], #16 // 8-byte Folded Reload
ret
```
https://github.com/llvm/llvm-project/pull/81135
More information about the llvm-commits mailing list