[llvm] 6457455 - [SVE] Use NEON for extract_vector_elt when the index is in range.
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 21 05:15:52 PDT 2020
Author: Paul Walker
Date: 2020-09-21T13:12:28+01:00
New Revision: 6457455248d5b83a7e4274f06b6313b15cd51421
URL: https://github.com/llvm/llvm-project/commit/6457455248d5b83a7e4274f06b6313b15cd51421
DIFF: https://github.com/llvm/llvm-project/commit/6457455248d5b83a7e4274f06b6313b15cd51421.diff
LOG: [SVE] Use NEON for extract_vector_elt when the index is in range.
Patch also adds missing patterns for unpacked vector types and
extracts of element zero.
Differential Revision: https://reviews.llvm.org/D87842
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/sve-extract-element.ll
llvm/test/CodeGen/AArch64/sve-insert-element.ll
llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fbe4b01a259a..3ec35cde6071 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -483,15 +483,6 @@ let Predicates = [HasSVE] in {
defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
- // Use more efficient NEON instructions to extract elements within the NEON
- // part (first 128bits) of an SVE register.
- def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
- (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
- def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
- (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
- def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
- (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
-
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
@@ -2162,6 +2153,28 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
(DUP_ZR_D $index)),
$src)>;
+ // Extract element from vector with scalar index
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+
// Extract element from vector with immediate index
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
(EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
@@ -2173,34 +2186,54 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
+ def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>;
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
- // Extract element from vector with scalar index
- def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
- (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
- ZPR:$vec)>;
+ // Extract element from vector with immediate index that's within the bottom 128-bits.
+ let AddedComplexity = 1 in {
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
+ (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
+ (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+ (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
+ (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
+ }
- def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
- ZPR:$vec)>;
- def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
- (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
- ZPR:$vec)>;
+ // Extract first element from vector.
+ let AddedComplexity = 2 in {
+ def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)),
+ (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)),
+ (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+ def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+ def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+ def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
+ (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+ }
}
let Predicates = [HasSVE, HasMatMulInt8] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index cd40f66a16c9..e8fba47ad5eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -1,69 +1,125 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
-define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+target triple = "aarch64-unknown-linux-gnu"
+
+define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane0_16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.b, b0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%b = extractelement <vscale x 16 x i8> %a, i32 0
ret i8 %b
}
-define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.b[15]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 15
+ ret i8 %b
+}
+
+define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.b, z0.b[16]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 16 x i8> %a, i32 16
+ ret i8 %b
+}
+
+define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: test_lane0_8xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, h0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%b = extractelement <vscale x 8 x i16> %a, i32 0
ret i16 %b
}
-define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: umov w0, v0.h[7]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 7
+ ret i16 %b
+}
+
+define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x i16> %a, i32 8
+ ret i16 %b
+}
+
+define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: test_lane0_4xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%b = extractelement <vscale x 4 x i32> %a, i32 0
ret i32 %b
}
-define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+define i32 @test_lane3_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_lane3_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, v0.s[3]
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x i32> %a, i32 3
+ ret i32 %b
+}
+
+define i32 @test_lane4_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_lane4_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[4]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x i32> %a, i32 4
+ ret i32 %b
+}
+
+define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: test_lane0_2xi64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i64> %a, i32 0
ret i64 %b
}
-define double @test_lane0_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane0_2xf64:
+define i64 @test_lane1_2xi64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: test_lane1_2xi64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: mov x0, v0.d[1]
; CHECK-NEXT: ret
- %b = extractelement <vscale x 2 x double> %a, i32 0
- ret double %b
+ %b = extractelement <vscale x 2 x i64> %a, i32 1
+ ret i64 %b
}
-define float @test_lane0_4xf32(<vscale x 4 x float> %a) {
-; CHECK-LABEL: test_lane0_4xf32:
+define i64 @test_lane2_2xi64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: test_lane2_2xi64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: mov z0.d, z0.d[2]
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
- %b = extractelement <vscale x 4 x float> %a, i32 0
- ret float %b
+ %b = extractelement <vscale x 2 x i64> %a, i32 2
+ ret i64 %b
}
-define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
+define half @test_lane0_8xf16(<vscale x 8 x half> %a) #0 {
; CHECK-LABEL: test_lane0_8xf16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
@@ -72,7 +128,172 @@ define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
ret half %b
}
-define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+define half @test_lane7_8xf16(<vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: test_lane7_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[7]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x half> %a, i32 7
+ ret half %b
+}
+
+define half @test_lane8_8xf16(<vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: test_lane8_8xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.h, z0.h[8]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 8 x half> %a, i32 8
+ ret half %b
+}
+
+define half @test_lane0_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane0_4xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x half> %a, i32 0
+ ret half %b
+}
+
+define half @test_lane3_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane3_4xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x half> %a, i32 3
+ ret half %b
+}
+
+define half @test_lane4_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane4_4xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[4]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x half> %a, i32 4
+ ret half %b
+}
+
+define half @test_lane0_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x half> %a, i32 0
+ ret half %b
+}
+
+define half @test_lane1_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x half> %a, i32 1
+ ret half %b
+}
+
+define half @test_lane2_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[2]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x half> %a, i32 2
+ ret half %b
+}
+
+define float @test_lane0_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x float> %a, i32 0
+ ret float %b
+}
+
+define float @test_lane3_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane3_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x float> %a, i32 3
+ ret float %b
+}
+
+define float @test_lane4_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane4_4xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.s, z0.s[4]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 4 x float> %a, i32 4
+ ret float %b
+}
+
+define float @test_lane0_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x float> %a, i32 0
+ ret float %b
+}
+
+define float @test_lane1_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x float> %a, i32 1
+ ret float %b
+}
+
+define float @test_lane2_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[2]
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x float> %a, i32 2
+ ret float %b
+}
+
+define double @test_lane0_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 0
+ ret double %b
+}
+
+define double @test_lane1_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 1
+ ret double %b
+}
+
+define double @test_lane2_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z0.d, z0.d[2]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x double> %a, i32 2
+ ret double %b
+}
+
+define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) #0 {
; CHECK-LABEL: test_lanex_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
@@ -84,7 +305,7 @@ define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
ret i8 %b
}
-define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
+define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) #0 {
; CHECK-LABEL: test_lanex_8xi16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
@@ -96,7 +317,7 @@ define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
ret i16 %b
}
-define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
+define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) #0 {
; CHECK-LABEL: test_lanex_4xi32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
@@ -108,7 +329,7 @@ define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
ret i32 %b
}
-define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
+define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) #0 {
; CHECK-LABEL: test_lanex_2xi64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
@@ -120,77 +341,89 @@ define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
ret i64 %b
}
-define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_2xf64:
+define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_8xf16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
-; CHECK-NEXT: whilels p0.d, xzr, x8
-; CHECK-NEXT: lastb d0, p0, z0.d
+; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
- %b = extractelement <vscale x 2 x double> %a, i32 %x
- ret double %b
+ %b = extractelement <vscale x 8 x half> %a, i32 %x
+ ret half %b
}
-define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_4xf32:
+define half @test_lanex_4xf16(<vscale x 4 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_4xf16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
; CHECK-NEXT: whilels p0.s, xzr, x8
-; CHECK-NEXT: lastb s0, p0, z0.s
+; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
- %b = extractelement <vscale x 4 x float> %a, i32 %x
- ret float %b
+ %b = extractelement <vscale x 4 x half> %a, i32 %x
+ ret half %b
}
-define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_8xf16:
+define half @test_lanex_2xf16(<vscale x 2 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
-; CHECK-NEXT: whilels p0.h, xzr, x8
+; CHECK-NEXT: whilels p0.d, xzr, x8
; CHECK-NEXT: lastb h0, p0, z0.h
; CHECK-NEXT: ret
- %b = extractelement <vscale x 8 x half> %a, i32 %x
+ %b = extractelement <vscale x 2 x half> %a, i32 %x
ret half %b
}
-; Deliberately choose an index that is out-of-bounds
-define i8 @test_lane64_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane64_16xi8:
+define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_4xf32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #64
-; CHECK-NEXT: whilels p0.b, xzr, x8
-; CHECK-NEXT: lastb w0, p0, z0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.s, xzr, x8
+; CHECK-NEXT: lastb s0, p0, z0.s
; CHECK-NEXT: ret
- %b = extractelement <vscale x 16 x i8> %a, i32 64
- ret i8 %b
+ %b = extractelement <vscale x 4 x float> %a, i32 %x
+ ret float %b
+}
+
+define float @test_lanex_2xf32(<vscale x 2 x float> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: whilels p0.d, xzr, x8
+; CHECK-NEXT: lastb s0, p0, z0.s
+; CHECK-NEXT: ret
+ %b = extractelement <vscale x 2 x float> %a, i32 %x
+ ret float %b
}
-define double @test_lane9_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane9_2xf64:
+define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
; CHECK-NEXT: whilels p0.d, xzr, x8
; CHECK-NEXT: lastb d0, p0, z0.d
; CHECK-NEXT: ret
- %b = extractelement <vscale x 2 x double> %a, i32 9
+ %b = extractelement <vscale x 2 x double> %a, i32 %x
ret double %b
}
; Deliberately choose an index that is undefined
-define i32 @test_lane64_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: test_lane64_4xi32:
+define i32 @test_undef_lane_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_undef_lane_4xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%b = extractelement <vscale x 4 x i32> %a, i32 undef
ret i32 %b
}
-define i8 @extract_of_insert_undef_16xi8(i8 %a) {
+define i8 @extract_of_insert_undef_16xi8(i8 %a) #0 {
; CHECK-LABEL: extract_of_insert_undef_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -199,7 +432,7 @@ define i8 @extract_of_insert_undef_16xi8(i8 %a) {
ret i8 %c
}
-define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
; CHECK-LABEL: extract0_of_insert0_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -208,7 +441,7 @@ define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
ret i8 %d
}
-define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
; CHECK-LABEL: extract64_of_insert64_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -217,18 +450,17 @@ define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
ret i8 %d
}
-define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
; CHECK-LABEL: extract_of_insert_diff_lanes_16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.b, z0.b[3]
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: umov w0, v0.b[3]
; CHECK-NEXT: ret
%c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
%d = extractelement <vscale x 16 x i8> %c, i32 3
ret i8 %d
}
-define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
+define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: test_lane0_zero_16xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w0, wzr
@@ -240,7 +472,7 @@ define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
; The DAG combiner should fold the extract of a splat to give element zero
; of the splat, i.e. %x. If the index is beyond the end of the scalable
; vector the result is undefined anyway.
-define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
+define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) #0 {
; CHECK-LABEL: test_lanex_splat_2xi64:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
@@ -249,3 +481,5 @@ define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
%c = extractelement <vscale x 2 x i64> %b, i32 %y
ret i64 %c
}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index a2a4a8e1ba74..cbe4b9391f07 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -182,9 +182,8 @@ define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_insert0_of_extract0_16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, b1
-; CHECK-NEXT: ptrue p0.b, vl1
; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: ptrue p0.b, vl1
; CHECK-NEXT: mov z0.b, p0/m, w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 16 x i8> %b, i32 0
@@ -212,14 +211,13 @@ define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %
define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_insert3_of_extract1_16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, z1.b[1]
-; CHECK-NEXT: mov w8, #3
-; CHECK-NEXT: index z2.b, #0, #1
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: mov z1.b, w8
+; CHECK-NEXT: mov w9, #3
+; CHECK-NEXT: umov w8, v1.b[1]
+; CHECK-NEXT: index z1.b, #0, #1
+; CHECK-NEXT: mov z2.b, w9
; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT: mov z0.b, p0/m, w9
+; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT: mov z0.b, p0/m, w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 16 x i8> %b, i32 1
%d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3
diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
index aa01dae05512..75c91f76f965 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -127,8 +127,7 @@ define i64 @split_extract_8i64_idx(<vscale x 8 x i64> %a, i32 %idx) {
define i16 @promote_extract_4i16(<vscale x 4 x i16> %a) {
; CHECK-LABEL: promote_extract_4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, z0.s[1]
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: mov w0, v0.s[1]
; CHECK-NEXT: ret
%ext = extractelement <vscale x 4 x i16> %a, i32 1
ret i16 %ext
@@ -137,8 +136,7 @@ define i16 @promote_extract_4i16(<vscale x 4 x i16> %a) {
define i8 @split_extract_32i8(<vscale x 32 x i8> %a) {
; CHECK-LABEL: split_extract_32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.b, z0.b[3]
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: umov w0, v0.b[3]
; CHECK-NEXT: ret
%ext = extractelement <vscale x 32 x i8> %a, i32 3
ret i8 %ext
More information about the llvm-commits
mailing list