[llvm] [LLVM][SVE] Extend dup(extract_elt(v,i)) isel patterns to cover more combinations. (PR #115189)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 8 06:46:34 PST 2024
================
@@ -56,19 +151,133 @@ define <vscale x 8 x half> @dup_extract_f16(<vscale x 8 x half> %data) {
ret <vscale x 8 x half> %.splat
}
-define <vscale x 4 x half> @dup_extract_f16_4(<vscale x 4 x half> %data) {
-; CHECK-LABEL: dup_extract_f16_4:
+define <vscale x 8 x half> @dup_extract_nxv8f16_nxv4f16(<vscale x 4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, z0.s[1]
; CHECK-NEXT: ret
%1 = extractelement <vscale x 4 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+ ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_nxv2f16(<vscale x 2 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: ret
+ %1 = extractelement <vscale x 2 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+ ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_v8f16(<8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov z0.h, z0.h[1]
+; CHECK-NEXT: ret
+ %1 = extractelement <8 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+ ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_v4f16(<4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z0.h, z0.h[1]
+; CHECK-NEXT: ret
+ %1 = extractelement <4 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+ ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv8f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[1]
+; CHECK-NEXT: ret
+ %1 = extractelement <vscale x 8 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+ ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv4f16(<vscale x 4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[1]
+; CHECK-NEXT: ret
+ %1 = extractelement <vscale x 4 x half> %data, i16 1
+ %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+ %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+ ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv2f16(<vscale x 2 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[1]
----------------
rj-jesus wrote:
Thanks, it looks good now. Have you considered supporting the remaining unpacked cases (the ones you've just removed) by scaling the index explicitly? For example, for `@dup_extract_nxv4f16_nxv2f16`, I believe we could generate:
```
mov z0.h, z0.h[4]
```
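(This would replace the two instructions we currently generate: each unpacked `nxv2f16` element occupies its own 64-bit container, so element 1 corresponds to 16-bit lane 4.) I don't think this is too important, though.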
I am, however, confused about something: the unpacked fixed and scalable extracts seem to index into different underlying elements (see [here](https://godbolt.org/z/r65nj8Mx7) for example). This doesn't directly concern this patch, but I'm struggling to find any references as to why this is the case. Do you happen to have any pointers? Many thanks in advance!
https://github.com/llvm/llvm-project/pull/115189
More information about the llvm-commits mailing list