[llvm] 6457455 - [SVE] Use NEON for extract_vector_elt when the index is in range.

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 21 05:15:52 PDT 2020


Author: Paul Walker
Date: 2020-09-21T13:12:28+01:00
New Revision: 6457455248d5b83a7e4274f06b6313b15cd51421

URL: https://github.com/llvm/llvm-project/commit/6457455248d5b83a7e4274f06b6313b15cd51421
DIFF: https://github.com/llvm/llvm-project/commit/6457455248d5b83a7e4274f06b6313b15cd51421.diff

LOG: [SVE] Use NEON for extract_vector_elt when the index is in range.

Patch also adds missing patterns for unpacked vector types and
extracts of element zero.

Differential Revision: https://reviews.llvm.org/D87842
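
For example (an illustrative sketch; the function name below is made up, but
the before/after code generation is taken from the updated
llvm/test/CodeGen/AArch64/sve-extract-element.ll checks in this patch),
extracting lane 3 of a <vscale x 16 x i8> vector always reads within the
bottom 128 bits of the register, so it no longer needs an SVE DUP of the
requested lane down to lane 0:

    ; Requires SVE, e.g. llc -mtriple=aarch64-linux-gnu -mattr=+sve
    define i8 @extract_lane3_16xi8(<vscale x 16 x i8> %a) {
      %b = extractelement <vscale x 16 x i8> %a, i32 3
      ret i8 %b
    }

    ; before this patch:            ; after this patch:
    ;   mov  z0.b, z0.b[3]          ;   umov w0, v0.b[3]
    ;   fmov w0, s0                 ;   ret
    ;   ret

Indices that can fall outside the first 128 bits (e.g. lane 16 of a
<vscale x 16 x i8>) still use the SVE form, and an extract of element zero
becomes a plain subregister read.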

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/test/CodeGen/AArch64/sve-extract-element.ll
    llvm/test/CodeGen/AArch64/sve-insert-element.ll
    llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fbe4b01a259a..3ec35cde6071 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -483,15 +483,6 @@ let Predicates = [HasSVE] in {
   defm FMAXV_VPZ   : sve_fp_fast_red<0b110, "fmaxv",   AArch64fmaxv_p>;
   defm FMINV_VPZ   : sve_fp_fast_red<0b111, "fminv",   AArch64fminv_p>;
 
-  // Use more efficient NEON instructions to extract elements within the NEON
-  // part (first 128bits) of an SVE register.
-  def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
-  def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
-            (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
-  def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
-            (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
-
   // Splat immediate (unpredicated)
   defm DUP_ZI   : sve_int_dup_imm<"dup">;
   defm FDUP_ZI  : sve_int_dup_fpimm<"fdup">;
@@ -2162,6 +2153,28 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
                                        (DUP_ZR_D $index)),
                         $src)>;
 
+  // Extract element from vector with scalar index
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+            (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+            (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>;
+
   // Extract element from vector with immediate index
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
@@ -2173,34 +2186,54 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
   def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>;
+  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>;
   def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+            (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>;
   def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
 
-  // Extract element from vector with scalar index
-  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
-            (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
-                         ZPR:$vec)>;
-  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
-            (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
-                         ZPR:$vec)>;
-  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
-            (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
-                         ZPR:$vec)>;
-  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
-            (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
-                         ZPR:$vec)>;
+  // Extract element from vector with immediate index that's within the bottom 128-bits.
+  let AddedComplexity = 1 in {
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
+            (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
+            (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
+            (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
+            (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
+  }
 
-  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
-            (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
-                         ZPR:$vec)>;
-  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
-            (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
-                         ZPR:$vec)>;
-  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
-            (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
-                         ZPR:$vec)>;
+  // Extract first element from vector.
+  let AddedComplexity = 2 in {
+  def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
+            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+  def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)),
+            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+  def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)),
+            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+  def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)),
+            (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+  def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
+            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+  def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)),
+            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+  def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)),
+            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
+  def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
+            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+  def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)),
+            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
+  def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
+            (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+  }
 }
 
 let Predicates = [HasSVE, HasMatMulInt8] in {

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index cd40f66a16c9..e8fba47ad5eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -1,69 +1,125 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
 ; WARN-NOT: warning
 
-define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+target triple = "aarch64-unknown-linux-gnu"
+
+define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 {
 ; CHECK-LABEL: test_lane0_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, b0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 16 x i8> %a, i32 0
   ret i8 %b
 }
 
-define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane15_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w0, v0.b[15]
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 15
+  ret i8 %b
+}
+
+define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 {
+; CHECK-LABEL: test_lane16_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, z0.b[16]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 16 x i8> %a, i32 16
+  ret i8 %b
+}
+
+define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: test_lane0_8xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 8 x i16> %a, i32 0
   ret i16 %b
 }
 
-define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane7_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w0, v0.h[7]
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x i16> %a, i32 7
+  ret i16 %b
+}
+
+define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: test_lane8_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[8]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x i16> %a, i32 8
+  ret i16 %b
+}
+
+define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: test_lane0_4xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 4 x i32> %a, i32 0
   ret i32 %b
 }
 
-define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+define i32 @test_lane3_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_lane3_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, v0.s[3]
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 3
+  ret i32 %b
+}
+
+define i32 @test_lane4_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_lane4_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[4]
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x i32> %a, i32 4
+  ret i32 %b
+}
+
+define i64 @test_lane0_2xi64(<vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: test_lane0_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 2 x i64> %a, i32 0
   ret i64 %b
 }
 
-define double @test_lane0_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane0_2xf64:
+define i64 @test_lane1_2xi64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: test_lane1_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    mov x0, v0.d[1]
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 2 x double> %a, i32 0
-  ret double %b
+  %b = extractelement <vscale x 2 x i64> %a, i32 1
+  ret i64 %b
 }
 
-define float @test_lane0_4xf32(<vscale x 4 x float> %a) {
-; CHECK-LABEL: test_lane0_4xf32:
+define i64 @test_lane2_2xi64(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: test_lane2_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    mov z0.d, z0.d[2]
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 4 x float> %a, i32 0
-  ret float %b
+  %b = extractelement <vscale x 2 x i64> %a, i32 2
+  ret i64 %b
 }
 
-define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
+define half @test_lane0_8xf16(<vscale x 8 x half> %a) #0 {
 ; CHECK-LABEL: test_lane0_8xf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -72,7 +128,172 @@ define half @test_lane0_8xf16(<vscale x 8 x half> %a) {
   ret half %b
 }
 
-define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+define half @test_lane7_8xf16(<vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: test_lane7_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[7]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x half> %a, i32 7
+  ret half %b
+}
+
+define half @test_lane8_8xf16(<vscale x 8 x half> %a) #0 {
+; CHECK-LABEL: test_lane8_8xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[8]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 8 x half> %a, i32 8
+  ret half %b
+}
+
+define half @test_lane0_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane0_4xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x half> %a, i32 0
+  ret half %b
+}
+
+define half @test_lane3_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane3_4xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x half> %a, i32 3
+  ret half %b
+}
+
+define half @test_lane4_4xf16(<vscale x 4 x half> %a) #0 {
+; CHECK-LABEL: test_lane4_4xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[4]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x half> %a, i32 4
+  ret half %b
+}
+
+define half @test_lane0_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x half> %a, i32 0
+  ret half %b
+}
+
+define half @test_lane1_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x half> %a, i32 1
+  ret half %b
+}
+
+define half @test_lane2_2xf16(<vscale x 2 x half> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[2]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x half> %a, i32 2
+  ret half %b
+}
+
+define float @test_lane0_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane0_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x float> %a, i32 0
+  ret float %b
+}
+
+define float @test_lane3_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane3_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x float> %a, i32 3
+  ret float %b
+}
+
+define float @test_lane4_4xf32(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: test_lane4_4xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[4]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 4 x float> %a, i32 4
+  ret float %b
+}
+
+define float @test_lane0_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x float> %a, i32 0
+  ret float %b
+}
+
+define float @test_lane1_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x float> %a, i32 1
+  ret float %b
+}
+
+define float @test_lane2_2xf32(<vscale x 2 x float> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[2]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x float> %a, i32 2
+  ret float %b
+}
+
+define double @test_lane0_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane0_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 0
+  ret double %b
+}
+
+define double @test_lane1_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane1_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 1
+  ret double %b
+}
+
+define double @test_lane2_2xf64(<vscale x 2 x double> %a) #0 {
+; CHECK-LABEL: test_lane2_2xf64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[2]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x double> %a, i32 2
+  ret double %b
+}
+
+define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) #0 {
 ; CHECK-LABEL: test_lanex_16xi8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
@@ -84,7 +305,7 @@ define i8 @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
   ret i8 %b
 }
 
-define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
+define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) #0 {
 ; CHECK-LABEL: test_lanex_8xi16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
@@ -96,7 +317,7 @@ define i16 @test_lanex_8xi16(<vscale x 8 x i16> %a, i32 %x) {
   ret i16 %b
 }
 
-define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
+define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) #0 {
 ; CHECK-LABEL: test_lanex_4xi32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
@@ -108,7 +329,7 @@ define i32 @test_lanex_4xi32(<vscale x 4 x i32> %a, i32 %x) {
   ret i32 %b
 }
 
-define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
+define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) #0 {
 ; CHECK-LABEL: test_lanex_2xi64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
@@ -120,77 +341,89 @@ define i64 @test_lanex_2xi64(<vscale x 2 x i64> %a, i32 %x) {
   ret i64 %b
 }
 
-define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_2xf64:
+define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_8xf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    lastb h0, p0, z0.h
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 2 x double> %a, i32 %x
-  ret double %b
+  %b = extractelement <vscale x 8 x half> %a, i32 %x
+  ret half %b
 }
 
-define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_4xf32:
+define half @test_lanex_4xf16(<vscale x 4 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_4xf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x8, w0
 ; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    lastb h0, p0, z0.h
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 4 x float> %a, i32 %x
-  ret float %b
+  %b = extractelement <vscale x 4 x half> %a, i32 %x
+  ret half %b
 }
 
-define half @test_lanex_8xf16(<vscale x 8 x half> %a, i32 %x) {
-; CHECK-LABEL: test_lanex_8xf16:
+define half @test_lanex_2xf16(<vscale x 2 x half> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    whilels p0.h, xzr, x8
+; CHECK-NEXT:    whilels p0.d, xzr, x8
 ; CHECK-NEXT:    lastb h0, p0, z0.h
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 8 x half> %a, i32 %x
+  %b = extractelement <vscale x 2 x half> %a, i32 %x
   ret half %b
 }
 
-; Deliberately choose an index that is out-of-bounds
-define i8 @test_lane64_16xi8(<vscale x 16 x i8> %a) {
-; CHECK-LABEL: test_lane64_16xi8:
+define float @test_lanex_4xf32(<vscale x 4 x float> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_4xf32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    whilels p0.b, xzr, x8
-; CHECK-NEXT:    lastb w0, p0, z0.b
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.s, xzr, x8
+; CHECK-NEXT:    lastb s0, p0, z0.s
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 16 x i8> %a, i32 64
-  ret i8 %b
+  %b = extractelement <vscale x 4 x float> %a, i32 %x
+  ret float %b
+}
+
+define float @test_lanex_2xf32(<vscale x 2 x float> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    whilels p0.d, xzr, x8
+; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ret
+  %b = extractelement <vscale x 2 x float> %a, i32 %x
+  ret float %b
 }
 
-define double @test_lane9_2xf64(<vscale x 2 x double> %a) {
-; CHECK-LABEL: test_lane9_2xf64:
+define double @test_lanex_2xf64(<vscale x 2 x double> %a, i32 %x) #0 {
+; CHECK-LABEL: test_lanex_2xf64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sxtw x8, w0
 ; CHECK-NEXT:    whilels p0.d, xzr, x8
 ; CHECK-NEXT:    lastb d0, p0, z0.d
 ; CHECK-NEXT:    ret
-  %b = extractelement <vscale x 2 x double> %a, i32 9
+  %b = extractelement <vscale x 2 x double> %a, i32 %x
   ret double %b
 }
 
 ; Deliberately choose an index that is undefined
-define i32 @test_lane64_4xi32(<vscale x 4 x i32> %a) {
-; CHECK-LABEL: test_lane64_4xi32:
+define i32 @test_undef_lane_4xi32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: test_undef_lane_4xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 4 x i32> %a, i32 undef
   ret i32 %b
 }
 
-define i8 @extract_of_insert_undef_16xi8(i8 %a) {
+define i8 @extract_of_insert_undef_16xi8(i8 %a) #0 {
 ; CHECK-LABEL: extract_of_insert_undef_16xi8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
@@ -199,7 +432,7 @@ define i8 @extract_of_insert_undef_16xi8(i8 %a) {
   ret i8 %c
 }
 
-define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
 ; CHECK-LABEL: extract0_of_insert0_16xi8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
@@ -208,7 +441,7 @@ define i8 @extract0_of_insert0_16xi8(<vscale x 16 x i8> %a, i8 %b) {
   ret i8 %d
 }
 
-define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
 ; CHECK-LABEL: extract64_of_insert64_16xi8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
@@ -217,18 +450,17 @@ define i8 @extract64_of_insert64_16xi8(<vscale x 16 x i8> %a, i8 %b) {
   ret i8 %d
 }
 
-define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) {
+define i8 @extract_of_insert_diff_lanes_16xi8(<vscale x 16 x i8> %a, i8 %b) #0 {
 ; CHECK-LABEL: extract_of_insert_diff_lanes_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, z0.b[3]
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    umov w0, v0.b[3]
 ; CHECK-NEXT:    ret
   %c = insertelement <vscale x 16 x i8> %a, i8 %b, i32 0
   %d = extractelement <vscale x 16 x i8> %c, i32 3
   ret i8 %d
 }
 
-define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
+define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) #0 {
 ; CHECK-LABEL: test_lane0_zero_16xi8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w0, wzr
@@ -240,7 +472,7 @@ define i8 @test_lane0_zero_16xi8(<vscale x 16 x i8> %a) {
 ; The DAG combiner should fold the extract of a splat to give element zero
 ; of the splat, i.e. %x. If the index is beyond the end of the scalable
 ; vector the result is undefined anyway.
-define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
+define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) #0 {
 ; CHECK-LABEL: test_lanex_splat_2xi64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
@@ -249,3 +481,5 @@ define i64 @test_lanex_splat_2xi64(i64 %x, i32 %y) {
   %c = extractelement <vscale x 2 x i64> %b, i32 %y
   ret i64 %c
 }
+
+attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index a2a4a8e1ba74..cbe4b9391f07 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -182,9 +182,8 @@ define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
 define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: test_insert0_of_extract0_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, b1
-; CHECK-NEXT:    ptrue p0.b, vl1
 ; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    ptrue p0.b, vl1
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %c = extractelement <vscale x 16 x i8> %b, i32 0
@@ -212,14 +211,13 @@ define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %
 define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: test_insert3_of_extract1_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, z1.b[1]
-; CHECK-NEXT:    mov w8, #3
-; CHECK-NEXT:    index z2.b, #0, #1
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    mov w9, #3
+; CHECK-NEXT:    umov w8, v1.b[1]
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, w9
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %c = extractelement <vscale x 16 x i8> %b, i32 1
   %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3

diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
index aa01dae05512..75c91f76f965 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -127,8 +127,7 @@ define i64 @split_extract_8i64_idx(<vscale x 8 x i64> %a, i32 %idx) {
 define i16 @promote_extract_4i16(<vscale x 4 x i16> %a) {
 ; CHECK-LABEL: promote_extract_4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    mov w0, v0.s[1]
 ; CHECK-NEXT:    ret
   %ext = extractelement <vscale x 4 x i16> %a, i32 1
   ret i16 %ext
@@ -137,8 +136,7 @@ define i16 @promote_extract_4i16(<vscale x 4 x i16> %a) {
 define i8 @split_extract_32i8(<vscale x 32 x i8> %a) {
 ; CHECK-LABEL: split_extract_32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, z0.b[3]
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    umov w0, v0.b[3]
 ; CHECK-NEXT:    ret
   %ext = extractelement <vscale x 32 x i8> %a, i32 3
   ret i8 %ext


        

