[llvm] [LLVM][SVE] Extend dup(extract_elt(v,i)) isel patterns to cover all combinations. (PR #115189)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 10:15:45 PST 2024


https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/115189

Adds missing bfloat patterns for unpacked scalable vectors.
Adds patterns for splatting extracts from fixed-length vectors.

>From fc0443c616bdb2af07c78c075509465bf2ee7476 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 5 Nov 2024 15:18:09 +0000
Subject: [PATCH 1/2] [NFC] Increase test coverage for SVE
 dup(extract_elt(v,i)) isel patterns.

---
 .../AArch64/aarch64-dup-extract-scalable.ll   | 570 +++++++++++++++++-
 1 file changed, 541 insertions(+), 29 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
index 8c9661730f1f94..f15dde2b327e18 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
 
-define <vscale x 16 x i8> @dup_extract_i8(<vscale x 16 x i8> %data) {
-; CHECK-LABEL: dup_extract_i8:
+define <vscale x 16 x i8> @dup_extract_nxv16i8_nxv16i8(<vscale x 16 x i8> %data) {
+; CHECK-LABEL: dup_extract_nxv16i8_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.b, z0.b[1]
 ; CHECK-NEXT:    ret
@@ -12,8 +12,33 @@ define <vscale x 16 x i8> @dup_extract_i8(<vscale x 16 x i8> %data) {
   ret <vscale x 16 x i8> %.splat
 }
 
-define <vscale x 8 x i16> @dup_extract_i16(<vscale x 8 x i16> %data) {
-; CHECK-LABEL: dup_extract_i16:
+define <vscale x 16 x i8> @dup_extract_nxv16i8_v16i8(<16 x i8> %data) {
+; CHECK-LABEL: dup_extract_nxv16i8_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.b[1]
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <16 x i8> %data, i8 1
+  %.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %1, i32 0
+  %.splat = shufflevector <vscale x 16 x i8> %.splatinsert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %.splat
+}
+
+define <vscale x 16 x i8> @dup_extract_nxv16i8_v8i8(<8 x i8> %data) {
+; CHECK-LABEL: dup_extract_nxv16i8_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w8, v0.b[1]
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x i8> %data, i8 1
+  %.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %1, i32 0
+  %.splat = shufflevector <vscale x 16 x i8> %.splatinsert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %.splat
+}
+
+define <vscale x 8 x i16> @dup_extract_nxv8i16_nxv8i16(<vscale x 8 x i16> %data) {
+; CHECK-LABEL: dup_extract_nxv8i16_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
@@ -23,8 +48,33 @@ define <vscale x 8 x i16> @dup_extract_i16(<vscale x 8 x i16> %data) {
   ret <vscale x 8 x i16> %.splat
 }
 
-define <vscale x 4 x i32> @dup_extract_i32(<vscale x 4 x i32> %data) {
-; CHECK-LABEL: dup_extract_i32:
+define <vscale x 8 x i16> @dup_extract_nxv8i16_v8i16(<8 x i16> %data) {
+; CHECK-LABEL: dup_extract_nxv8i16_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x i16> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x i16> poison, i16 %1, i32 0
+  %.splat = shufflevector <vscale x 8 x i16> %.splatinsert, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %.splat
+}
+
+define <vscale x 8 x i16> @dup_extract_nxv8i16_v4i16(<4 x i16> %data) {
+; CHECK-LABEL: dup_extract_nxv8i16_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x i16> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x i16> poison, i16 %1, i32 0
+  %.splat = shufflevector <vscale x 8 x i16> %.splatinsert, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %.splat
+}
+
+define <vscale x 4 x i32> @dup_extract_nxv4i32_nxv4i32(<vscale x 4 x i32> %data) {
+; CHECK-LABEL: dup_extract_nxv4i32_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
@@ -34,8 +84,33 @@ define <vscale x 4 x i32> @dup_extract_i32(<vscale x 4 x i32> %data) {
   ret <vscale x 4 x i32> %.splat
 }
 
-define <vscale x 2 x i64> @dup_extract_i64(<vscale x 2 x i64> %data) {
-; CHECK-LABEL: dup_extract_i64:
+define <vscale x 4 x i32> @dup_extract_nxv4i32_v4i32(<4 x i32> %data) {
+; CHECK-LABEL: dup_extract_nxv4i32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x i32> %data, i32 1
+  %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %1, i32 0
+  %.splat = shufflevector <vscale x 4 x i32> %.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %.splat
+}
+
+define <vscale x 4 x i32> @dup_extract_nxv4i32_v2i32(<2 x i32> %data) {
+; CHECK-LABEL: dup_extract_nxv4i32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
+  %1 = extractelement <2 x i32> %data, i32 1
+  %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %1, i32 0
+  %.splat = shufflevector <vscale x 4 x i32> %.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %.splat
+}
+
+define <vscale x 2 x i64> @dup_extract_nxv2i64_nxv2i64(<vscale x 2 x i64> %data) {
+; CHECK-LABEL: dup_extract_nxv2i64_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    ret
@@ -45,8 +120,31 @@ define <vscale x 2 x i64> @dup_extract_i64(<vscale x 2 x i64> %data) {
   ret <vscale x 2 x i64> %.splat
 }
 
-define <vscale x 8 x half> @dup_extract_f16(<vscale x 8 x half> %data) {
-; CHECK-LABEL: dup_extract_f16:
+define <vscale x 2 x i64> @dup_extract_nxv2i64_v2i64(<2 x i64> %data) {
+; CHECK-LABEL: dup_extract_nxv2i64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ret
+  %1 = extractelement <2 x i64> %data, i64 1
+  %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %1, i32 0
+  %.splat = shufflevector <vscale x 2 x i64> %.splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %.splat
+}
+
+define <vscale x 2 x i64> @dup_extract_nxv2i64_v1i64(<1 x i64> %data) {
+; CHECK-LABEL: dup_extract_nxv2i64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ret
+  %1 = extractelement <1 x i64> %data, i64 1
+  %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %1, i32 0
+  %.splat = shufflevector <vscale x 2 x i64> %.splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_nxv8f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
@@ -56,8 +154,69 @@ define <vscale x 8 x half> @dup_extract_f16(<vscale x 8 x half> %data) {
   ret <vscale x 8 x half> %.splat
 }
 
-define <vscale x 4 x half> @dup_extract_f16_4(<vscale x 4 x half> %data) {
-; CHECK-LABEL: dup_extract_f16_4:
+define <vscale x 8 x half> @dup_extract_nxv8f16_nxv4f16(<vscale x 4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 4 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_nxv2f16(<vscale x 2 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_v8f16(<8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 8 x half> @dup_extract_nxv8f16_v4f16(<4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv8f16_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 8 x half> %.splatinsert, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv8f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 8 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv4f16(<vscale x 4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
@@ -67,30 +226,105 @@ define <vscale x 4 x half> @dup_extract_f16_4(<vscale x 4 x half> %data) {
   ret <vscale x 4 x half> %.splat
 }
 
-define <vscale x 2 x half> @dup_extract_f16_2(<vscale x 2 x half> %data) {
-; CHECK-LABEL: dup_extract_f16_2:
+define <vscale x 4 x half> @dup_extract_nxv4f16_nxv2f16(<vscale x 2 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_nxv2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_v8f16(<8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 4 x half> @dup_extract_nxv4f16_v4f16(<4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv4f16_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 4 x half> %.splatinsert, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %.splat
+}
+
+define <vscale x 2 x half> @dup_extract_nxv2f16_nxv8f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv2f16_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
   %.splat = shufflevector <vscale x 2 x half> %.splatinsert, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
   ret <vscale x 2 x half> %.splat
 }
 
-define <vscale x 8 x bfloat> @dup_extract_bf16(<vscale x 8 x bfloat> %data) #0 {
-; CHECK-LABEL: dup_extract_bf16:
+define <vscale x 2 x half> @dup_extract_nxv2f16_nxv4f16(<vscale x 4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv2f16_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
-  %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
-  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
-  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
-  ret <vscale x 8 x bfloat> %.splat
+  %1 = extractelement <vscale x 4 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 2 x half> %.splatinsert, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %.splat
+}
+
+define <vscale x 2 x half> @dup_extract_nxv2f16_nxv2f16(<vscale x 2 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv2f16_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 2 x half> %.splatinsert, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %.splat
+}
+
+define <vscale x 2 x half> @dup_extract_nxv2f16_v8f16(<8 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv2f16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 2 x half> %.splatinsert, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %.splat
+}
+
+define <vscale x 2 x half> @dup_extract_nxv2f16_v4f16(<4 x half> %data) {
+; CHECK-LABEL: dup_extract_nxv2f16_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x half> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
+  %.splat = shufflevector <vscale x 2 x half> %.splatinsert, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %.splat
 }
 
-define <vscale x 4 x float> @dup_extract_f32(<vscale x 4 x float> %data) {
-; CHECK-LABEL: dup_extract_f32:
+define <vscale x 4 x float> @dup_extract_nxv4f32_nxv4f32(<vscale x 4 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv4f32_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
@@ -100,19 +334,93 @@ define <vscale x 4 x float> @dup_extract_f32(<vscale x 4 x float> %data) {
   ret <vscale x 4 x float> %.splat
 }
 
-define <vscale x 2 x float> @dup_extract_f32_2(<vscale x 2 x float> %data) {
-; CHECK-LABEL: dup_extract_f32_2:
+define <vscale x 4 x float> @dup_extract_nxv4f32_nxv2f32(<vscale x 2 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv4f32_nxv2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 4 x float> %.splatinsert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %.splat
+}
+
+define <vscale x 4 x float> @dup_extract_nxv4f32_v4f32(<4 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv4f32_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 4 x float> %.splatinsert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %.splat
+}
+
+define <vscale x 4 x float> @dup_extract_nxv4f32_v2f32(<2 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv4f32_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = extractelement <2 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 4 x float> %.splatinsert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %.splat
+}
+
+define <vscale x 2 x float> @dup_extract_nxv2f32_nxv4f32(<vscale x 4 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv2f32_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 4 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
   %.splat = shufflevector <vscale x 2 x float> %.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
   ret <vscale x 2 x float> %.splat
 }
 
-define <vscale x 2 x double> @dup_extract_f64(<vscale x 2 x double> %data) {
-; CHECK-LABEL: dup_extract_f64:
+define <vscale x 2 x float> @dup_extract_nxv2f32_nxv2f32(<vscale x 2 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv2f32_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 2 x float> %.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %.splat
+}
+
+define <vscale x 2 x float> @dup_extract_nxv2f32_v4f32(<4 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv2f32_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 2 x float> %.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %.splat
+}
+
+define <vscale x 2 x float> @dup_extract_nxv2f32_v2f32(<2 x float> %data) {
+; CHECK-LABEL: dup_extract_nxv2f32_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = extractelement <2 x float> %data, i32 1
+  %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
+  %.splat = shufflevector <vscale x 2 x float> %.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %.splat
+}
+
+define <vscale x 2 x double> @dup_extract_nxv2f64_nxv2f64(<vscale x 2 x double> %data) {
+; CHECK-LABEL: dup_extract_nxv2f64_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    ret
@@ -122,5 +430,209 @@ define <vscale x 2 x double> @dup_extract_f64(<vscale x 2 x double> %data) {
   ret <vscale x 2 x double> %.splat
 }
 
-; +bf16 is required for the bfloat version.
-attributes #0 = { "target-features"="+sve,+bf16" }
+define <vscale x 2 x double> @dup_extract_nxv2f64_v2f64(<2 x double> %data) {
+; CHECK-LABEL: dup_extract_nxv2f64_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov d0, v0.d[1]
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ret
+  %1 = extractelement <2 x double> %data, i64 1
+  %.splatinsert = insertelement <vscale x 2 x double> poison, double %1, i32 0
+  %.splat = shufflevector <vscale x 2 x double> %.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %.splat
+}
+
+define <vscale x 2 x double> @dup_extract_nxv2f64_v1f64(<1 x double> %data) {
+; CHECK-LABEL: dup_extract_nxv2f64_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ret
+  %1 = extractelement <1 x double> %data, i64 1
+  %.splatinsert = insertelement <vscale x 2 x double> poison, double %1, i32 0
+  %.splat = shufflevector <vscale x 2 x double> %.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %.splat
+}
+
+define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv8bf16_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %.splat
+}
+
+define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv8bf16_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %.splat
+}
+
+define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv2bf16(<vscale x 2 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv8bf16_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %.splat
+}
+
+define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_v8bf16(<8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv8bf16_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %.splat
+}
+
+define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_v4bf16(<4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv8bf16_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 8 x bfloat> %.splatinsert, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x bfloat> %.splat
+}
+
+define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv4bf16_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 4 x bfloat> %.splatinsert, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x bfloat> %.splat
+}
+
+define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv4bf16_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 4 x bfloat> %.splatinsert, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x bfloat> %.splat
+}
+
+define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv4bf16_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 4 x bfloat> %.splatinsert, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x bfloat> %.splat
+}
+
+define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_v8bf16(<8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv4bf16_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 4 x bfloat> %.splatinsert, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x bfloat> %.splat
+}
+
+define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_v4bf16(<4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv4bf16_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 4 x bfloat> %.splatinsert, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x bfloat> %.splat
+}
+
+define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv8bf16(<vscale x 8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv2bf16_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 2 x bfloat> %.splatinsert, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x bfloat> %.splat
+}
+
+define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv2bf16_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 2 x bfloat> %.splatinsert, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x bfloat> %.splat
+}
+
+define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv2bf16_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 2 x bfloat> %.splatinsert, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x bfloat> %.splat
+}
+
+define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_v8bf16(<8 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv2bf16_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <8 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 2 x bfloat> %.splatinsert, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x bfloat> %.splat
+}
+
+define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_v4bf16(<4 x bfloat> %data) {
+; CHECK-LABEL: dup_extract_nxv2bf16_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
+  %1 = extractelement <4 x bfloat> %data, i16 1
+  %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
+  %.splat = shufflevector <vscale x 2 x bfloat> %.splatinsert, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x bfloat> %.splat
+}
+
+attributes #0 = { "target-features"="+sve" }

>From 51a2974bb6f1e086fa3897dce4cb80790aa8281e Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 5 Nov 2024 15:18:38 +0000
Subject: [PATCH 2/2] [LLVM][SVE] Extend dup(extract_elt(v,i)) isel patterns to
 cover all combinations.

Adds missing bfloat patterns for unpacked scalable vectors.
Adds patterns for splatting extracts from fixed-length vectors.
---
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 129 +++++++++++++++---
 .../AArch64/aarch64-dup-extract-scalable.ll   | 123 +++++++----------
 2 files changed, 157 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 5cfcc01afd20f3..f542c7a34ad60e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -59,6 +59,57 @@ class SVEType<ValueType VT> {
     !eq(VT, nxv8f16): nxv2f16,
     !eq(VT, nxv8bf16): nxv2bf16,
     true : untyped);
+
+  // The 64-bit vector subreg of VT.
+  ValueType DSub = !cond(
+    !eq(VT, nxv16i8): v8i8,
+    !eq(VT, nxv8i16): v4i16,
+    !eq(VT, nxv4i32): v2i32,
+    !eq(VT, nxv2i64): v1i64,
+    !eq(VT, nxv2f16): v4f16,
+    !eq(VT, nxv4f16): v4f16,
+    !eq(VT, nxv8f16): v4f16,
+    !eq(VT, nxv2f32): v2f32,
+    !eq(VT, nxv4f32): v2f32,
+    !eq(VT, nxv2f64): v1f64,
+    !eq(VT, nxv2bf16): v4bf16,
+    !eq(VT, nxv4bf16): v4bf16,
+    !eq(VT, nxv8bf16): v4bf16,
+    true : untyped);
+
+  // The 128-bit vector subreg of VT.
+  ValueType ZSub = !cond(
+    !eq(VT, nxv16i8): v16i8,
+    !eq(VT, nxv8i16): v8i16,
+    !eq(VT, nxv4i32): v4i32,
+    !eq(VT, nxv2i64): v2i64,
+    !eq(VT, nxv2f16): v8f16,
+    !eq(VT, nxv4f16): v8f16,
+    !eq(VT, nxv8f16): v8f16,
+    !eq(VT, nxv2f32): v4f32,
+    !eq(VT, nxv4f32): v4f32,
+    !eq(VT, nxv2f64): v2f64,
+    !eq(VT, nxv2bf16): v8bf16,
+    !eq(VT, nxv4bf16): v8bf16,
+    !eq(VT, nxv8bf16): v8bf16,
+    true : untyped);
+
+  // The legal scalar used to hold a vector element.
+  ValueType EltAsScalar = !cond(
+    !eq(VT, nxv16i8): i32,
+    !eq(VT, nxv8i16): i32,
+    !eq(VT, nxv4i32): i32,
+    !eq(VT, nxv2i64): i64,
+    !eq(VT, nxv2f16): f16,
+    !eq(VT, nxv4f16): f16,
+    !eq(VT, nxv8f16): f16,
+    !eq(VT, nxv2f32): f32,
+    !eq(VT, nxv4f32): f32,
+    !eq(VT, nxv2f64): f64,
+    !eq(VT, nxv2bf16): bf16,
+    !eq(VT, nxv4bf16): bf16,
+    !eq(VT, nxv8bf16): bf16,
+    true : untyped);
 }
 
 def SDT_AArch64Setcc : SDTypeProfile<1, 4, [
@@ -1402,29 +1453,67 @@ multiclass sve_int_perm_dup_i<string asm> {
   def : InstAlias<"mov $Zd, $Qn",
                   (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
 
-  // Duplicate extracted element of vector into all vector elements
+  // Duplicate an extracted vector element across a vector.
+
   def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))),
             (!cast<Instruction>(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>;
-  def : Pat<(nxv8i16 (splat_vector (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
-            (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
-  def : Pat<(nxv4i32 (splat_vector (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
-            (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
-  def : Pat<(nxv2i64 (splat_vector (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
-            (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
-  def : Pat<(nxv8f16 (splat_vector (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
-            (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
-  def : Pat<(nxv8bf16 (splat_vector (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
-            (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
-  def : Pat<(nxv4f16 (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
-            (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
-  def : Pat<(nxv2f16 (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
-            (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
-  def : Pat<(nxv4f32 (splat_vector (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
-            (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
-  def : Pat<(nxv2f32 (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
-            (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
-  def : Pat<(nxv2f64 (splat_vector (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+  def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (v16i8 V128:$vec), sve_elm_idx_extdup_b:$index)))),
+            (!cast<Instruction>(NAME # _B) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_b:$index)>;
+  def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (v8i8 V64:$vec), sve_elm_idx_extdup_b:$index)))),
+            (!cast<Instruction>(NAME # _B) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_b:$index)>;
+
+  foreach VT = [nxv8i16, nxv2f16, nxv4f16, nxv8f16, nxv2bf16, nxv4bf16, nxv8bf16] in {
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.Packed ZPR:$vec), sve_elm_idx_extdup_h:$index)))),
+              (!cast<Instruction>(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.ZSub V128:$vec), sve_elm_idx_extdup_h:$index)))),
+              (!cast<Instruction>(NAME # _H) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_h:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.DSub V64:$vec), sve_elm_idx_extdup_h:$index)))),
+              (!cast<Instruction>(NAME # _H) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_h:$index)>;
+  }
+
+  foreach VT = [nxv4i32, nxv2f32, nxv4f32 ] in {
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.Packed ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+              (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.ZSub V128:$vec), sve_elm_idx_extdup_s:$index)))),
+              (!cast<Instruction>(NAME # _S) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_s:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.DSub V64:$vec), sve_elm_idx_extdup_s:$index)))),
+              (!cast<Instruction>(NAME # _S) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_s:$index)>;
+  }
+
+  foreach VT = [nxv2i64, nxv2f64] in {
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (VT ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+              (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.ZSub V128:$vec), sve_elm_idx_extdup_d:$index)))),
+              (!cast<Instruction>(NAME # _D) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_d:$index)>;
+    def : Pat<(VT (splat_vector (SVEType<VT>.EltAsScalar (vector_extract (SVEType<VT>.DSub V64:$vec), sve_elm_idx_extdup_d:$index)))),
+              (!cast<Instruction>(NAME # _D) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_d:$index)>;
+  }
+
+  // When extracting from an unpacked vector the index must be scaled to account
+  // for the "holes" in the underlying packed vector type. We get the scaling
+  // for free by "promoting" the element type to one whose underlying vector type
+  // is packed.
+
+  foreach VT = [nxv2f16, nxv4f16, nxv8f16] in {
+    def : Pat<(VT (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+              (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+    def : Pat<(VT (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+              (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+  }
+
+  foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
+    def : Pat<(VT (splat_vector (bf16 (vector_extract (nxv4bf16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))),
+              (!cast<Instruction>(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>;
+    def : Pat<(VT (splat_vector (bf16 (vector_extract (nxv2bf16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
+              (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+  }
+
+  foreach VT = [nxv2f32, nxv4f32] in {
+    def : Pat<(VT (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))),
             (!cast<Instruction>(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>;
+  }
+
+  // Duplicate an indexed 128-bit segment across a vector.
 
   def : Pat<(nxv16i8 (AArch64duplane128 nxv16i8:$Op1, i64:$imm)),
             (!cast<Instruction>(NAME # _Q) $Op1, $imm)>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
index f15dde2b327e18..0cf8aec52fe258 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll
@@ -15,8 +15,8 @@ define <vscale x 16 x i8> @dup_extract_nxv16i8_nxv16i8(<vscale x 16 x i8> %data)
 define <vscale x 16 x i8> @dup_extract_nxv16i8_v16i8(<16 x i8> %data) {
 ; CHECK-LABEL: dup_extract_nxv16i8_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.b, z0.b[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <16 x i8> %data, i8 1
   %.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %1, i32 0
@@ -27,9 +27,8 @@ define <vscale x 16 x i8> @dup_extract_nxv16i8_v16i8(<16 x i8> %data) {
 define <vscale x 16 x i8> @dup_extract_nxv16i8_v8i8(<8 x i8> %data) {
 ; CHECK-LABEL: dup_extract_nxv16i8_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.b, z0.b[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x i8> %data, i8 1
   %.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %1, i32 0
@@ -51,8 +50,8 @@ define <vscale x 8 x i16> @dup_extract_nxv8i16_nxv8i16(<vscale x 8 x i16> %data)
 define <vscale x 8 x i16> @dup_extract_nxv8i16_v8i16(<8 x i16> %data) {
 ; CHECK-LABEL: dup_extract_nxv8i16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x i16> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x i16> poison, i16 %1, i32 0
@@ -63,9 +62,8 @@ define <vscale x 8 x i16> @dup_extract_nxv8i16_v8i16(<8 x i16> %data) {
 define <vscale x 8 x i16> @dup_extract_nxv8i16_v4i16(<4 x i16> %data) {
 ; CHECK-LABEL: dup_extract_nxv8i16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[1]
-; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x i16> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x i16> poison, i16 %1, i32 0
@@ -87,8 +85,8 @@ define <vscale x 4 x i32> @dup_extract_nxv4i32_nxv4i32(<vscale x 4 x i32> %data)
 define <vscale x 4 x i32> @dup_extract_nxv4i32_v4i32(<4 x i32> %data) {
 ; CHECK-LABEL: dup_extract_nxv4i32_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x i32> %data, i32 1
   %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %1, i32 0
@@ -99,9 +97,8 @@ define <vscale x 4 x i32> @dup_extract_nxv4i32_v4i32(<4 x i32> %data) {
 define <vscale x 4 x i32> @dup_extract_nxv4i32_v2i32(<2 x i32> %data) {
 ; CHECK-LABEL: dup_extract_nxv4i32_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <2 x i32> %data, i32 1
   %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %1, i32 0
@@ -123,8 +120,8 @@ define <vscale x 2 x i64> @dup_extract_nxv2i64_nxv2i64(<vscale x 2 x i64> %data)
 define <vscale x 2 x i64> @dup_extract_nxv2i64_v2i64(<2 x i64> %data) {
 ; CHECK-LABEL: dup_extract_nxv2i64_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <2 x i64> %data, i64 1
   %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %1, i32 0
@@ -158,7 +155,6 @@ define <vscale x 8 x half> @dup_extract_nxv8f16_nxv4f16(<vscale x 4 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv8f16_nxv4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
@@ -170,7 +166,6 @@ define <vscale x 8 x half> @dup_extract_nxv8f16_nxv2f16(<vscale x 2 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv8f16_nxv2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
@@ -181,8 +176,8 @@ define <vscale x 8 x half> @dup_extract_nxv8f16_nxv2f16(<vscale x 2 x half> %dat
 define <vscale x 8 x half> @dup_extract_nxv8f16_v8f16(<8 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv8f16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
@@ -193,9 +188,8 @@ define <vscale x 8 x half> @dup_extract_nxv8f16_v8f16(<8 x half> %data) {
 define <vscale x 8 x half> @dup_extract_nxv8f16_v4f16(<4 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv8f16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x half> poison, half %1, i32 0
@@ -207,7 +201,6 @@ define <vscale x 4 x half> @dup_extract_nxv4f16_nxv8f16(<vscale x 8 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv4f16_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
@@ -230,7 +223,6 @@ define <vscale x 4 x half> @dup_extract_nxv4f16_nxv2f16(<vscale x 2 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv4f16_nxv2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
@@ -241,8 +233,8 @@ define <vscale x 4 x half> @dup_extract_nxv4f16_nxv2f16(<vscale x 2 x half> %dat
 define <vscale x 4 x half> @dup_extract_nxv4f16_v8f16(<8 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv4f16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
@@ -253,9 +245,8 @@ define <vscale x 4 x half> @dup_extract_nxv4f16_v8f16(<8 x half> %data) {
 define <vscale x 4 x half> @dup_extract_nxv4f16_v4f16(<4 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv4f16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x half> poison, half %1, i32 0
@@ -267,7 +258,6 @@ define <vscale x 2 x half> @dup_extract_nxv2f16_nxv8f16(<vscale x 8 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv2f16_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
@@ -279,7 +269,6 @@ define <vscale x 2 x half> @dup_extract_nxv2f16_nxv4f16(<vscale x 4 x half> %dat
 ; CHECK-LABEL: dup_extract_nxv2f16_nxv4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
@@ -301,8 +290,8 @@ define <vscale x 2 x half> @dup_extract_nxv2f16_nxv2f16(<vscale x 2 x half> %dat
 define <vscale x 2 x half> @dup_extract_nxv2f16_v8f16(<8 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv2f16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
@@ -313,9 +302,8 @@ define <vscale x 2 x half> @dup_extract_nxv2f16_v8f16(<8 x half> %data) {
 define <vscale x 2 x half> @dup_extract_nxv2f16_v4f16(<4 x half> %data) {
 ; CHECK-LABEL: dup_extract_nxv2f16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x half> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x half> poison, half %1, i32 0
@@ -338,7 +326,6 @@ define <vscale x 4 x float> @dup_extract_nxv4f32_nxv2f32(<vscale x 2 x float> %d
 ; CHECK-LABEL: dup_extract_nxv4f32_nxv2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
@@ -349,8 +336,8 @@ define <vscale x 4 x float> @dup_extract_nxv4f32_nxv2f32(<vscale x 2 x float> %d
 define <vscale x 4 x float> @dup_extract_nxv4f32_v4f32(<4 x float> %data) {
 ; CHECK-LABEL: dup_extract_nxv4f32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
@@ -361,9 +348,8 @@ define <vscale x 4 x float> @dup_extract_nxv4f32_v4f32(<4 x float> %data) {
 define <vscale x 4 x float> @dup_extract_nxv4f32_v2f32(<2 x float> %data) {
 ; CHECK-LABEL: dup_extract_nxv4f32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <2 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 4 x float> poison, float %1, i32 0
@@ -375,7 +361,6 @@ define <vscale x 2 x float> @dup_extract_nxv2f32_nxv4f32(<vscale x 4 x float> %d
 ; CHECK-LABEL: dup_extract_nxv2f32_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
@@ -397,8 +382,8 @@ define <vscale x 2 x float> @dup_extract_nxv2f32_nxv2f32(<vscale x 2 x float> %d
 define <vscale x 2 x float> @dup_extract_nxv2f32_v4f32(<4 x float> %data) {
 ; CHECK-LABEL: dup_extract_nxv2f32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
@@ -409,9 +394,8 @@ define <vscale x 2 x float> @dup_extract_nxv2f32_v4f32(<4 x float> %data) {
 define <vscale x 2 x float> @dup_extract_nxv2f32_v2f32(<2 x float> %data) {
 ; CHECK-LABEL: dup_extract_nxv2f32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <2 x float> %data, i32 1
   %.splatinsert = insertelement <vscale x 2 x float> poison, float %1, i32 0
@@ -433,8 +417,8 @@ define <vscale x 2 x double> @dup_extract_nxv2f64_nxv2f64(<vscale x 2 x double>
 define <vscale x 2 x double> @dup_extract_nxv2f64_v2f64(<2 x double> %data) {
 ; CHECK-LABEL: dup_extract_nxv2f64_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d0, v0.d[1]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <2 x double> %data, i64 1
   %.splatinsert = insertelement <vscale x 2 x double> poison, double %1, i32 0
@@ -468,7 +452,6 @@ define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat
 ; CHECK-LABEL: dup_extract_nxv8bf16_nxv4bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
@@ -480,7 +463,6 @@ define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv2bf16(<vscale x 2 x bfloat
 ; CHECK-LABEL: dup_extract_nxv8bf16_nxv2bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
@@ -491,8 +473,8 @@ define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_nxv2bf16(<vscale x 2 x bfloat
 define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_v8bf16(<8 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv8bf16_v8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
@@ -503,9 +485,8 @@ define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_v8bf16(<8 x bfloat> %data) {
 define <vscale x 8 x bfloat> @dup_extract_nxv8bf16_v4bf16(<4 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv8bf16_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 8 x bfloat> poison, bfloat %1, i32 0
@@ -517,7 +498,6 @@ define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat
 ; CHECK-LABEL: dup_extract_nxv4bf16_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
@@ -529,7 +509,6 @@ define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat
 ; CHECK-LABEL: dup_extract_nxv4bf16_nxv4bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
@@ -541,7 +520,6 @@ define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat
 ; CHECK-LABEL: dup_extract_nxv4bf16_nxv2bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
@@ -552,8 +530,8 @@ define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat
 define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_v8bf16(<8 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv4bf16_v8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
@@ -564,9 +542,8 @@ define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_v8bf16(<8 x bfloat> %data) {
 define <vscale x 4 x bfloat> @dup_extract_nxv4bf16_v4bf16(<4 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv4bf16_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 4 x bfloat> poison, bfloat %1, i32 0
@@ -578,7 +555,6 @@ define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv8bf16(<vscale x 8 x bfloat
 ; CHECK-LABEL: dup_extract_nxv2bf16_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 8 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
@@ -590,7 +566,6 @@ define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat
 ; CHECK-LABEL: dup_extract_nxv2bf16_nxv4bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
@@ -602,7 +577,6 @@ define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat
 ; CHECK-LABEL: dup_extract_nxv2bf16_nxv2bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    ret
   %1 = extractelement <vscale x 2 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
@@ -613,8 +587,8 @@ define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat
 define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_v8bf16(<8 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv2bf16_v8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <8 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0
@@ -625,9 +599,8 @@ define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_v8bf16(<8 x bfloat> %data) {
 define <vscale x 2 x bfloat> @dup_extract_nxv2bf16_v4bf16(<4 x bfloat> %data) {
 ; CHECK-LABEL: dup_extract_nxv2bf16_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    ret
   %1 = extractelement <4 x bfloat> %data, i16 1
   %.splatinsert = insertelement <vscale x 2 x bfloat> poison, bfloat %1, i32 0



More information about the llvm-commits mailing list