[llvm] 0af1808 - [AArch64][SVE] Fix VLS mulh tests to use literal vectors

David Truby via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 14 06:23:22 PST 2022


Author: David Truby
Date: 2022-01-14T14:21:36Z
New Revision: 0af1808f9b99b49b87b8503466110baee42c5aea

URL: https://github.com/llvm/llvm-project/commit/0af1808f9b99b49b87b8503466110baee42c5aea
DIFF: https://github.com/llvm/llvm-project/commit/0af1808f9b99b49b87b8503466110baee42c5aea.diff

LOG: [AArch64][SVE] Fix VLS mulh tests to use literal vectors

This is a simple test fix that canonicalises the SVE mulh tests: the shift
amounts previously built with an insertelement/shufflevector splat idiom are
replaced by equivalent literal constant vectors. With the literal form the
smulh/umulh patterns match at every supported vector width, so the VBITS_EQ_*
check prefixes collapse into the corresponding VBITS_GE_* ones.
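
For reference, a minimal sketch of the two forms, taken from the <4 x i32>
case in the test (the wider vectors follow the same shape):

  ; before: shift amount built with insertelement + shufflevector
  %insert = insertelement <4 x i32> undef, i32 16, i64 0
  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
  %shr = lshr <4 x i32> %mul, %splat

  ; after: literal splat constant
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>

A splat of a constant folds to a constant vector anyway, which is presumably
why the literal form is treated as the canonical input here.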

Added: 
    

Modified: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 063792242c871..42e103694e051 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
 ; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
 ; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
@@ -61,7 +61,7 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
   %1 = sext <8 x i8> %op1 to <8 x i16>
   %2 = sext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
-  %shr = lshr <8 x i16> %mul, %splat
+  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <8 x i16> %shr to <8 x i8>
   ret <8 x i8> %res
 }
@@ -74,69 +74,43 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %insert = insertelement <16 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
-  %shr = lshr <16 x i16> %mul, %splat
+  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <16 x i16> %shr to <16 x i8>
   ret <16 x i8> %res
 }
 
 define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v32i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.b, vl32
-; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v32i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
-  %insert = insertelement <32 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
   %1 = sext <32 x i8> %op1 to <32 x i16>
   %2 = sext <32 x i8> %op2 to <32 x i16>
   %mul = mul <32 x i16> %1, %2
-  %shr = lshr <32 x i16> %mul, %splat
+  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <32 x i16> %shr to <32 x i8>
   store <32 x i8> %res, <32 x i8>* %a
   ret void
 }
 
 define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v64i8:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.b, vl64
-; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_1024-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
   %insert = insertelement <64 x i16> undef, i16 8, i64 0
@@ -144,40 +118,28 @@ define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
   %1 = sext <64 x i8> %op1 to <64 x i16>
   %2 = sext <64 x i8> %op2 to <64 x i16>
   %mul = mul <64 x i16> %1, %2
-  %shr = lshr <64 x i16> %mul, %splat
+  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <64 x i16> %shr to <64 x i8>
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
 define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v128i8:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_EQ_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v128i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1sb { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sb { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_2048-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+; VBITS_GE_1024-LABEL: smulh_v128i8:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
 
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
-  %insert = insertelement <128 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
   %1 = sext <128 x i8> %op1 to <128 x i16>
   %2 = sext <128 x i8> %op2 to <128 x i16>
   %mul = mul <128 x i16> %1, %2
-  %shr = lshr <128 x i16> %mul, %splat
+  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <128 x i16> %shr to <128 x i8>
   store <128 x i8> %res, <128 x i8>* %a
   ret void
@@ -194,12 +156,10 @@ define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
-  %insert = insertelement <256 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
   %1 = sext <256 x i8> %op1 to <256 x i16>
   %2 = sext <256 x i8> %op2 to <256 x i16>
   %mul = mul <256 x i16> %1, %2
-  %shr = lshr <256 x i16> %mul, %splat
+  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <256 x i16> %shr to <256 x i8>
   store <256 x i8> %res, <256 x i8>* %a
   ret void
@@ -220,12 +180,10 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
-  %shr = lshr <4 x i32> %mul, %splat
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
   %res = trunc <4 x i32> %shr to <4 x i16>
   ret <4 x i16> %res
 }
@@ -238,110 +196,69 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
-  %shr = lshr <8 x i32> %mul, %splat
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <8 x i32> %shr to <8 x i16>
   ret <8 x i16> %res
 }
 
 define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v16i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v16i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
-  %insert = insertelement <16 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
   %1 = sext <16 x i16> %op1 to <16 x i32>
   %2 = sext <16 x i16> %op2 to <16 x i32>
   %mul = mul <16 x i32> %1, %2
-  %shr = lshr <16 x i32> %mul, %splat
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <16 x i32> %shr to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %a
   ret void
 }
 
 define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v32i16:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.h, vl32
-; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
-  %insert = insertelement <32 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
   %1 = sext <32 x i16> %op1 to <32 x i32>
   %2 = sext <32 x i16> %op2 to <32 x i32>
   %mul = mul <32 x i32> %1, %2
-  %shr = lshr <32 x i32> %mul, %splat
+  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <32 x i32> %shr to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
 define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v64i16:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_EQ_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v64i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1sh { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sh { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: smulh_v64i16:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
-  %insert = insertelement <64 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
   %1 = sext <64 x i16> %op1 to <64 x i32>
   %2 = sext <64 x i16> %op2 to <64 x i32>
   %mul = mul <64 x i32> %1, %2
-  %shr = lshr <64 x i32> %mul, %splat
+  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <64 x i32> %shr to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %a
   ret void
@@ -358,12 +275,10 @@ define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
-  %insert = insertelement <128 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
   %1 = sext <128 x i16> %op1 to <128 x i32>
   %2 = sext <128 x i16> %op2 to <128 x i32>
   %mul = mul <128 x i32> %1, %2
-  %shr = lshr <128 x i32> %mul, %splat
+  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <128 x i32> %shr to <128 x i16>
   store <128 x i16> %res, <128 x i16>* %a
   ret void
@@ -379,12 +294,10 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
-  %shr = lshr <2 x i64> %mul, %splat
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
   %res = trunc <2 x i64> %shr to <2 x i32>
   ret <2 x i32> %res
 }
@@ -397,110 +310,69 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
-  %shr = lshr <4 x i64> %mul, %splat
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
   %res = trunc <4 x i64> %shr to <4 x i32>
   ret <4 x i32> %res
 }
 
 define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_256-LABEL: smulh_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: smulh_v8i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: smulh_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
-  %insert = insertelement <8 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i32> %op1 to <8 x i64>
   %2 = sext <8 x i32> %op2 to <8 x i64>
   %mul = mul <8 x i64> %1, %2
-  %shr = lshr <8 x i64> %mul, %splat
+  %shr = lshr <8 x i64> %mul,  <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <8 x i64> %shr to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %a
   ret void
 }
 
 define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_512-LABEL: smulh_v16i32:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.s, vl16
-; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: smulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: smulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
-  %insert = insertelement <16 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
   %1 = sext <16 x i32> %op1 to <16 x i64>
   %2 = sext <16 x i32> %op2 to <16 x i64>
   %mul = mul <16 x i64> %1, %2
-  %shr = lshr <16 x i64> %mul, %splat
+  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <16 x i64> %shr to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
 define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_1024-LABEL: smulh_v32i32:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_EQ_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: smulh_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1sw { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: smulh_v32i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
-  %insert = insertelement <32 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
   %1 = sext <32 x i32> %op1 to <32 x i64>
   %2 = sext <32 x i32> %op2 to <32 x i64>
   %mul = mul <32 x i64> %1, %2
-  %shr = lshr <32 x i64> %mul, %splat
+  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <32 x i64> %shr to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %a
   ret void
@@ -517,12 +389,10 @@ define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
-  %insert = insertelement <64 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
   %1 = sext <64 x i32> %op1 to <64 x i64>
   %2 = sext <64 x i32> %op2 to <64 x i64>
   %mul = mul <64 x i64> %1, %2
-  %shr = lshr <64 x i64> %mul, %splat
+  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <64 x i64> %shr to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %a
   ret void
@@ -558,12 +428,10 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
-  %shr = lshr <2 x i128> %mul, %splat
+  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
   %res = trunc <2 x i128> %shr to <2 x i64>
   ret <2 x i64> %res
 }
@@ -579,12 +447,10 @@ define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
-  %insert = insertelement <4 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i64> %op1 to <4 x i128>
   %2 = sext <4 x i64> %op2 to <4 x i128>
   %mul = mul <4 x i128> %1, %2
-  %shr = lshr <4 x i128> %mul, %splat
+  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
   %res = trunc <4 x i128> %shr to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %a
   ret void
@@ -601,12 +467,10 @@ define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
-  %insert = insertelement <8 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i64> %op1 to <8 x i128>
   %2 = sext <8 x i64> %op2 to <8 x i128>
   %mul = mul <8 x i128> %1, %2
-  %shr = lshr <8 x i128> %mul, %splat
+  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <8 x i128> %shr to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %a
   ret void
@@ -623,12 +487,10 @@ define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
-  %insert = insertelement <16 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
   %1 = sext <16 x i64> %op1 to <16 x i128>
   %2 = sext <16 x i64> %op2 to <16 x i128>
   %mul = mul <16 x i128> %1, %2
-  %shr = lshr <16 x i128> %mul, %splat
+  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <16 x i128> %shr to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %a
   ret void
@@ -645,12 +507,10 @@ define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
-  %insert = insertelement <32 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
   %1 = sext <32 x i64> %op1 to <32 x i128>
   %2 = sext <32 x i64> %op2 to <32 x i128>
   %mul = mul <32 x i128> %1, %2
-  %shr = lshr <32 x i128> %mul, %splat
+  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <32 x i128> %shr to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %a
   ret void
@@ -684,12 +544,10 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT:    mov v0.b[7], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
-  %shr = lshr <8 x i16> %mul, %splat
+  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <8 x i16> %shr to <8 x i8>
   ret <8 x i8> %res
 }
@@ -702,101 +560,63 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    uzp2 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
-  %insert = insertelement <16 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
-  %shr = lshr <16 x i16> %mul, %splat
+  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <16 x i16> %shr to <16 x i8>
   ret <16 x i8> %res
 }
 
 define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v32i8:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.b, vl32
-; VBITS_EQ_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_256-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v32i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v32i8:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
-  %insert = insertelement <32 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
   %1 = zext <32 x i8> %op1 to <32 x i16>
   %2 = zext <32 x i8> %op2 to <32 x i16>
   %mul = mul <32 x i16> %1, %2
-  %shr = lshr <32 x i16> %mul, %splat
+  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <32 x i16> %shr to <32 x i8>
   store <32 x i8> %res, <32 x i8>* %a
   ret void
 }
 
 define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v64i8:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.b, vl64
-; VBITS_EQ_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_512-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v64i8:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_1024-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_1024-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v64i8:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
+; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <64 x i8>, <64 x i8>* %a
   %op2 = load <64 x i8>, <64 x i8>* %b
-  %insert = insertelement <64 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
   %1 = zext <64 x i8> %op1 to <64 x i16>
   %2 = zext <64 x i8> %op2 to <64 x i16>
   %mul = mul <64 x i16> %1, %2
-  %shr = lshr <64 x i16> %mul, %splat
+  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <64 x i16> %shr to <64 x i8>
   store <64 x i8> %res, <64 x i8>* %a
   ret void
 }
 
 define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v128i8:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.b, vl128
-; VBITS_EQ_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
-; VBITS_EQ_1024-NEXT:    st1b { z0.b }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v128i8:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1b { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; VBITS_GE_2048-NEXT:    lsr z0.h, p0/m, z0.h, #8
-; VBITS_GE_2048-NEXT:    st1b { z0.h }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
+; VBITS_GE_1024-LABEL: umulh_v128i8:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; VBITS_GE_1024-NEXT:    st1b { z0.b }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
 
   %op1 = load <128 x i8>, <128 x i8>* %a
   %op2 = load <128 x i8>, <128 x i8>* %b
@@ -805,7 +625,7 @@ define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
   %1 = zext <128 x i8> %op1 to <128 x i16>
   %2 = zext <128 x i8> %op2 to <128 x i16>
   %mul = mul <128 x i16> %1, %2
-  %shr = lshr <128 x i16> %mul, %splat
+  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <128 x i16> %shr to <128 x i8>
   store <128 x i8> %res, <128 x i8>* %a
   ret void
@@ -822,12 +642,10 @@ define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <256 x i8>, <256 x i8>* %a
   %op2 = load <256 x i8>, <256 x i8>* %b
-  %insert = insertelement <256 x i16> undef, i16 8, i64 0
-  %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
   %1 = zext <256 x i8> %op1 to <256 x i16>
   %2 = zext <256 x i8> %op2 to <256 x i16>
   %mul = mul <256 x i16> %1, %2
-  %shr = lshr <256 x i16> %mul, %splat
+  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %res = trunc <256 x i16> %shr to <256 x i8>
   store <256 x i8> %res, <256 x i8>* %a
   ret void
@@ -848,12 +666,10 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
-  %shr = lshr <4 x i32> %mul, %splat
+  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
   %res = trunc <4 x i32> %shr to <4 x i16>
   ret <4 x i16> %res
 }
@@ -866,110 +682,69 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
-  %insert = insertelement <8 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
-  %shr = lshr <8 x i32> %mul, %splat
+  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <8 x i32> %shr to <8 x i16>
   ret <8 x i16> %res
 }
 
 define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v16i16:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v16i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v16i16:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
-  %insert = insertelement <16 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
   %1 = zext <16 x i16> %op1 to <16 x i32>
   %2 = zext <16 x i16> %op2 to <16 x i32>
   %mul = mul <16 x i32> %1, %2
-  %shr = lshr <16 x i32> %mul, %splat
+  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <16 x i32> %shr to <16 x i16>
   store <16 x i16> %res, <16 x i16>* %a
   ret void
 }
 
 define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v32i16:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.h, vl32
-; VBITS_EQ_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_512-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v32i16:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_1024-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_1024-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v32i16:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i16>, <32 x i16>* %a
   %op2 = load <32 x i16>, <32 x i16>* %b
-  %insert = insertelement <32 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
   %1 = zext <32 x i16> %op1 to <32 x i32>
   %2 = zext <32 x i16> %op2 to <32 x i32>
   %mul = mul <32 x i32> %1, %2
-  %shr = lshr <32 x i32> %mul, %splat
+  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <32 x i32> %shr to <32 x i16>
   store <32 x i16> %res, <32 x i16>* %a
   ret void
 }
 
 define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v64i16:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_EQ_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
-; VBITS_EQ_1024-NEXT:    st1h { z0.h }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v64i16:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1h { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_2048-NEXT:    lsr z0.s, p0/m, z0.s, #16
-; VBITS_GE_2048-NEXT:    st1h { z0.s }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: umulh_v64i16:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; VBITS_GE_1024-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
   %op1 = load <64 x i16>, <64 x i16>* %a
   %op2 = load <64 x i16>, <64 x i16>* %b
-  %insert = insertelement <64 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
   %1 = zext <64 x i16> %op1 to <64 x i32>
   %2 = zext <64 x i16> %op2 to <64 x i32>
   %mul = mul <64 x i32> %1, %2
-  %shr = lshr <64 x i32> %mul, %splat
+  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <64 x i32> %shr to <64 x i16>
   store <64 x i16> %res, <64 x i16>* %a
   ret void
@@ -986,12 +761,10 @@ define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <128 x i16>, <128 x i16>* %a
   %op2 = load <128 x i16>, <128 x i16>* %b
-  %insert = insertelement <128 x i32> undef, i32 16, i64 0
-  %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
   %1 = zext <128 x i16> %op1 to <128 x i32>
   %2 = zext <128 x i16> %op2 to <128 x i32>
   %mul = mul <128 x i32> %1, %2
-  %shr = lshr <128 x i32> %mul, %splat
+  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %res = trunc <128 x i32> %shr to <128 x i16>
   store <128 x i16> %res, <128 x i16>* %a
   ret void
@@ -1007,12 +780,10 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
 ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
-  %shr = lshr <2 x i64> %mul, %splat
+  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
   %res = trunc <2 x i64> %shr to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1025,36 +796,23 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
-  %insert = insertelement <4 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
-  %shr = lshr <4 x i64> %mul, %splat
+  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
   %res = trunc <4 x i64> %shr to <4 x i32>
   ret <4 x i32> %res
 }
 
 define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
-; VBITS_EQ_256-LABEL: umulh_v8i32:
-; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: umulh_v8i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_512-NEXT:    ret
-
+; VBITS_GE_256-LABEL: umulh_v8i32:
+; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    ret
   %op1 = load <8 x i32>, <8 x i32>* %a
   %op2 = load <8 x i32>, <8 x i32>* %b
   %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -1062,73 +820,47 @@ define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
   %1 = zext <8 x i32> %op1 to <8 x i64>
   %2 = zext <8 x i32> %op2 to <8 x i64>
   %mul = mul <8 x i64> %1, %2
-  %shr = lshr <8 x i64> %mul, %splat
+  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <8 x i64> %shr to <8 x i32>
   store <8 x i32> %res, <8 x i32>* %a
   ret void
 }
 
 define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
-; VBITS_EQ_512-LABEL: umulh_v16i32:
-; VBITS_EQ_512:       // %bb.0:
-; VBITS_EQ_512-NEXT:    ptrue p0.s, vl16
-; VBITS_EQ_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_512-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_512-NEXT:    ret
-;
-; VBITS_GE_1024-LABEL: umulh_v16i32:
-; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_1024-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_1024-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_1024-NEXT:    ret
-
+; VBITS_GE_512-LABEL: umulh_v16i32:
+; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i32>, <16 x i32>* %a
   %op2 = load <16 x i32>, <16 x i32>* %b
-  %insert = insertelement <16 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
   %1 = zext <16 x i32> %op1 to <16 x i64>
   %2 = zext <16 x i32> %op2 to <16 x i64>
   %mul = mul <16 x i64> %1, %2
-  %shr = lshr <16 x i64> %mul, %splat
+  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <16 x i64> %shr to <16 x i32>
   store <16 x i32> %res, <16 x i32>* %a
   ret void
 }
 
 define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
-; VBITS_EQ_1024-LABEL: umulh_v32i32:
-; VBITS_EQ_1024:       // %bb.0:
-; VBITS_EQ_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_EQ_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_EQ_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; VBITS_EQ_1024-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_EQ_1024-NEXT:    ret
-;
-; VBITS_GE_2048-LABEL: umulh_v32i32:
-; VBITS_GE_2048:       // %bb.0:
-; VBITS_GE_2048-NEXT:    ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT:    ld1w { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; VBITS_GE_2048-NEXT:    lsr z0.d, p0/m, z0.d, #32
-; VBITS_GE_2048-NEXT:    st1w { z0.d }, p0, [x0]
-; VBITS_GE_2048-NEXT:    ret
-
+; VBITS_GE_1024-LABEL: umulh_v32i32:
+; VBITS_GE_1024:       // %bb.0:
+; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    ret
   %op1 = load <32 x i32>, <32 x i32>* %a
   %op2 = load <32 x i32>, <32 x i32>* %b
-  %insert = insertelement <32 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
   %1 = zext <32 x i32> %op1 to <32 x i64>
   %2 = zext <32 x i32> %op2 to <32 x i64>
   %mul = mul <32 x i64> %1, %2
-  %shr = lshr <32 x i64> %mul, %splat
+  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <32 x i64> %shr to <32 x i32>
   store <32 x i32> %res, <32 x i32>* %a
   ret void
@@ -1145,12 +877,10 @@ define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <64 x i32>, <64 x i32>* %a
   %op2 = load <64 x i32>, <64 x i32>* %b
-  %insert = insertelement <64 x i64> undef, i64 32, i64 0
-  %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
   %1 = zext <64 x i32> %op1 to <64 x i64>
   %2 = zext <64 x i32> %op2 to <64 x i64>
   %mul = mul <64 x i64> %1, %2
-  %shr = lshr <64 x i64> %mul, %splat
+  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
   %res = trunc <64 x i64> %shr to <64 x i32>
   store <64 x i32> %res, <64 x i32>* %a
   ret void
@@ -1166,12 +896,10 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <1 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
-  %shr = lshr <1 x i128> %mul, %splat
+  %shr = lshr <1 x i128> %mul, <i128 64>
   %res = trunc <1 x i128> %shr to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1186,12 +914,10 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
-  %insert = insertelement <2 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
-  %shr = lshr <2 x i128> %mul, %splat
+  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
   %res = trunc <2 x i128> %shr to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1207,12 +933,10 @@ define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, <4 x i64>* %a
   %op2 = load <4 x i64>, <4 x i64>* %b
-  %insert = insertelement <4 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
   %1 = zext <4 x i64> %op1 to <4 x i128>
   %2 = zext <4 x i64> %op2 to <4 x i128>
   %mul = mul <4 x i128> %1, %2
-  %shr = lshr <4 x i128> %mul, %splat
+  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
   %res = trunc <4 x i128> %shr to <4 x i64>
   store <4 x i64> %res, <4 x i64>* %a
   ret void
@@ -1229,12 +953,10 @@ define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i64>, <8 x i64>* %a
   %op2 = load <8 x i64>, <8 x i64>* %b
-  %insert = insertelement <8 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
   %1 = zext <8 x i64> %op1 to <8 x i128>
   %2 = zext <8 x i64> %op2 to <8 x i128>
   %mul = mul <8 x i128> %1, %2
-  %shr = lshr <8 x i128> %mul, %splat
+  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <8 x i128> %shr to <8 x i64>
   store <8 x i64> %res, <8 x i64>* %a
   ret void
@@ -1251,12 +973,10 @@ define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
 ; VBITS_GE_1024-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %op2 = load <16 x i64>, <16 x i64>* %b
-  %insert = insertelement <16 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
   %1 = zext <16 x i64> %op1 to <16 x i128>
   %2 = zext <16 x i64> %op2 to <16 x i128>
   %mul = mul <16 x i128> %1, %2
-  %shr = lshr <16 x i128> %mul, %splat
+  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <16 x i128> %shr to <16 x i64>
   store <16 x i64> %res, <16 x i64>* %a
   ret void
@@ -1273,12 +993,10 @@ define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
 ; VBITS_GE_2048-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %op2 = load <32 x i64>, <32 x i64>* %b
-  %insert = insertelement <32 x i128> undef, i128 64, i128 0
-  %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
   %1 = zext <32 x i64> %op1 to <32 x i128>
   %2 = zext <32 x i64> %op2 to <32 x i128>
   %mul = mul <32 x i128> %1, %2
-  %shr = lshr <32 x i128> %mul, %splat
+  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
   %res = trunc <32 x i128> %shr to <32 x i64>
   store <32 x i64> %res, <32 x i64>* %a
   ret void
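
For reference, the idiom these tests exercise computes the high half of a widened multiply: both operands are extended to twice the element width, multiplied, shifted right by the original element width, and truncated back. The change above only swaps the insertelement/shufflevector splat of the shift amount for the equivalent literal constant vector; the two spellings denote the same splat constant, so the idiom itself is unchanged. A minimal sketch of the canonicalised form follows (the function name @umulh_example is illustrative, not part of the test file):

define <4 x i32> @umulh_example(<4 x i32> %a, <4 x i32> %b) {
  ; Widen to twice the element width so the full product is retained.
  %ext1 = zext <4 x i32> %a to <4 x i64>
  %ext2 = zext <4 x i32> %b to <4 x i64>
  %mul = mul <4 x i64> %ext1, %ext2
  ; Literal splat of the element width (32); equivalent to splatting
  ; via insertelement + shufflevector as the old tests did.
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  ; Keep only the high 32 bits of each 64-bit product.
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}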