[llvm] [AArch64] Improve cost model for legal subvec insert/extract (PR #81135)

Fri Feb 9 01:33:07 PST 2024

================
@@ -568,6 +568,32 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     return Cost;
   }
+  case Intrinsic::vector_extract: {
+    // If both the vector argument and the return type are legal types, then
+    // this should be a no-op or simple operation; return a relatively low cost.
+    LLVMContext &C = RetTy->getContext();
+    EVT MRTy = getTLI()->getValueType(DL, RetTy);
+    EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+    TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    if (RLK.first == TargetLoweringBase::TypeLegal &&
+        PLK.first == TargetLoweringBase::TypeLegal)
+      return InstructionCost(1);
----------------
david-arm wrote:

At first glance this seems too optimistic because there are cases where the cost is higher. See llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll for some examples:

```
define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_0(<vscale x 16 x i1> %in) {
; CHECK-LABEL: extract_nxv4i1_nxv16i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> %in, i64 0)
  ret <vscale x 4 x i1> %res
}
```

or

```
define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_0(<vscale x 16 x i1> %in) {
; CHECK-LABEL: extract_nxv2i1_nxv16i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1> %in, i64 0)
  ret <vscale x 2 x i1> %res
}
```

I think these should have a cost higher than 1, since they lower to two and three `punpklo` instructions respectively. Also, what about mixing legal fixed-width and scalable vector types, such as:

```
define void @foo(ptr %a, ptr %b) vscale_range(16,0) {
  %op = load <vscale x 8 x i16>, ptr %a
  %ret = call <4 x i16> @llvm.vector.extract.v4i16.nxv8i16(<vscale x 8 x i16> %op, i64 4)
  store <4 x i16> %ret, ptr %b
  ret void
}
```

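which leads to the following assembly output, where the extract is lowered as a store of the whole vector to the stack followed by a reload of the subvector: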

```
foo:                                    // @foo
	str	x29, [sp, #-16]!                // 8-byte Folded Spill
	addvl	sp, sp, #-1
	ptrue	p0.h
	ld1h	{ z0.h }, p0/z, [x0]
	st1h	{ z0.h }, p0, [sp]
	ldr	d0, [sp, #8]
	str	d0, [x1]
	addvl	sp, sp, #1
	ldr	x29, [sp], #16                  // 8-byte Folded Reload
	ret
```


https://github.com/llvm/llvm-project/pull/81135