[llvm] 2b5e2d7 - [AArch64][GlobalISel] Extend arm64-vshift.ll test coverage. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 25 01:03:56 PST 2024


Author: David Green
Date: 2024-11-25T09:03:50Z
New Revision: 2b5e2d74d38274e783ccf0de37aa106c76816f9e

URL: https://github.com/llvm/llvm-project/commit/2b5e2d74d38274e783ccf0de37aa106c76816f9e
DIFF: https://github.com/llvm/llvm-project/commit/2b5e2d74d38274e783ccf0de37aa106c76816f9e.diff

LOG: [AArch64][GlobalISel] Extend arm64-vshift.ll test coverage. NFC

Added: 
    

Modified: 
    llvm/test/CodeGen/AArch64/arm64-vshift.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 7af7c235f9ac16..2f543cc324bc22 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1,12 +1,114 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for sqshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshl1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshl1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshl_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshl_scalar_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshr1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for urshr_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshr1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srshr_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu2d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu1d_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu_i64_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu_i32_constant
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrn4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshrun4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrn4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqrshrun4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqrshrn4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn1s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uqshrn4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for neon_ushl_vscalar_constant_shift
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for neon_ushl_scalar_constant_shift
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for neon_sshll_vscalar_constant_shift
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for neon_sshll_scalar_constant_shift_m1
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for ursra1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for ursra_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srsra1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for srsra_scalar
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli8b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli4h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli2s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli1d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli16b
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli8h
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli4s
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sli2d
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqshlu_zero_shift_amount
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for lshr_trunc_v2i64_v2i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for ashr_trunc_v2i64_v2i8
 
 define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: sqshl8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqshl.8b v0, v0, v1
+; CHECK-NEXT:    sqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -19,7 +121,7 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqshl.4h v0, v0, v1
+; CHECK-NEXT:    sqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -32,7 +134,7 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqshl.2s v0, v0, v1
+; CHECK-NEXT:    sqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -97,7 +199,7 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqshl.8b v0, v0, v1
+; CHECK-NEXT:    uqshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -110,7 +212,7 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqshl.4h v0, v0, v1
+; CHECK-NEXT:    uqshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -123,7 +225,7 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqshl.2s v0, v0, v1
+; CHECK-NEXT:    uqshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -136,7 +238,7 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshl.16b v0, v0, v1
+; CHECK-NEXT:    sqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -149,7 +251,7 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshl.8h v0, v0, v1
+; CHECK-NEXT:    sqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -162,7 +264,7 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshl.4s v0, v0, v1
+; CHECK-NEXT:    sqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -175,7 +277,7 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshl.2d v0, v0, v1
+; CHECK-NEXT:    sqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -188,7 +290,7 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshl.16b v0, v0, v1
+; CHECK-NEXT:    uqshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -201,7 +303,7 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshl.8h v0, v0, v1
+; CHECK-NEXT:    uqshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -214,7 +316,7 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshl.4s v0, v0, v1
+; CHECK-NEXT:    uqshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -227,7 +329,7 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshl.2d v0, v0, v1
+; CHECK-NEXT:    uqshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -315,7 +417,7 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    srshl.8b v0, v0, v1
+; CHECK-NEXT:    srshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -328,7 +430,7 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    srshl.4h v0, v0, v1
+; CHECK-NEXT:    srshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -341,7 +443,7 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    srshl.2s v0, v0, v1
+; CHECK-NEXT:    srshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -394,10 +496,10 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @srshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: srshl_scalar_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    srshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -411,7 +513,7 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    urshl.8b v0, v0, v1
+; CHECK-NEXT:    urshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -424,7 +526,7 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    urshl.4h v0, v0, v1
+; CHECK-NEXT:    urshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -437,7 +539,7 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    urshl.2s v0, v0, v1
+; CHECK-NEXT:    urshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -490,10 +592,10 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @urshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: urshl_scalar_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    urshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -507,7 +609,7 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    srshl.16b v0, v0, v1
+; CHECK-NEXT:    srshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -520,7 +622,7 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    srshl.8h v0, v0, v1
+; CHECK-NEXT:    srshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -533,7 +635,7 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    srshl.4s v0, v0, v1
+; CHECK-NEXT:    srshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -546,7 +648,7 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    srshl.2d v0, v0, v1
+; CHECK-NEXT:    srshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -559,7 +661,7 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    urshl.16b v0, v0, v1
+; CHECK-NEXT:    urshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -572,7 +674,7 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    urshl.8h v0, v0, v1
+; CHECK-NEXT:    urshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -585,7 +687,7 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    urshl.4s v0, v0, v1
+; CHECK-NEXT:    urshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -598,7 +700,7 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    urshl.2d v0, v0, v1
+; CHECK-NEXT:    urshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -633,7 +735,7 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqrshl.8b v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -646,7 +748,7 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqrshl.4h v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -659,7 +761,7 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sqrshl.2s v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -672,7 +774,7 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqrshl.8b v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -685,7 +787,7 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqrshl.4h v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -698,7 +800,7 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    uqrshl.2s v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -711,7 +813,7 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshl.16b v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -724,7 +826,7 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshl.8h v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -737,7 +839,7 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshl.4s v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -750,7 +852,7 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshl.2d v0, v0, v1
+; CHECK-NEXT:    sqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -803,10 +905,10 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @sqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshl_scalar_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    sqrshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -820,7 +922,7 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshl.16b v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -833,7 +935,7 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshl.8h v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -846,7 +948,7 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshl.4s v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -859,7 +961,7 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshl.2d v0, v0, v1
+; CHECK-NEXT:    uqrshl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -912,10 +1014,10 @@ define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind {
 define i64 @uqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshl_scalar_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d1, x9
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    fmov d1, x8
+; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    uqrshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -947,77 +1049,126 @@ declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind
 declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i8> @urshr8b(ptr %A) nounwind {
-; CHECK-LABEL: urshr8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    urshr.8b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    urshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @urshr4h(ptr %A) nounwind {
-; CHECK-LABEL: urshr4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    urshr.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    urshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @urshr2s(ptr %A) nounwind {
-; CHECK-LABEL: urshr2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    urshr.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    urshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
   ret <2 x i32> %tmp3
 }
 
 define <16 x i8> @urshr16b(ptr %A) nounwind {
-; CHECK-LABEL: urshr16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    urshr.16b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    urshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i16> @urshr8h(ptr %A) nounwind {
-; CHECK-LABEL: urshr8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    urshr.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    urshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
   ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @urshr4s(ptr %A) nounwind {
-; CHECK-LABEL: urshr4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    urshr.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    urshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
   ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @urshr2d(ptr %A) nounwind {
-; CHECK-LABEL: urshr2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    urshr.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshr2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    urshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshr2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
   ret <2 x i64> %tmp3
@@ -1047,77 +1198,126 @@ define i64 @urshr_scalar(ptr %A) nounwind {
 }
 
 define <8 x i8> @srshr8b(ptr %A) nounwind {
-; CHECK-LABEL: srshr8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    srshr.8b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    srshr v0.8b, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @srshr4h(ptr %A) nounwind {
-; CHECK-LABEL: srshr4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    srshr.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    srshr v0.4h, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @srshr2s(ptr %A) nounwind {
-; CHECK-LABEL: srshr2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    srshr.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    srshr v0.2s, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
   ret <2 x i32> %tmp3
 }
 
 define <16 x i8> @srshr16b(ptr %A) nounwind {
-; CHECK-LABEL: srshr16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    srshr.16b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    srshr v0.16b, v0.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i16> @srshr8h(ptr %A) nounwind {
-; CHECK-LABEL: srshr8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    srshr.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    srshr v0.8h, v0.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
   ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @srshr4s(ptr %A) nounwind {
-; CHECK-LABEL: srshr4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    srshr.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    srshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
   ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @srshr2d(ptr %A) nounwind {
-; CHECK-LABEL: srshr2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    srshr.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshr2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    srshr v0.2d, v0.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshr2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
   ret <2 x i64> %tmp3
@@ -1150,7 +1350,7 @@ define <8 x i8> @sqshlu8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshlu.8b v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.8b, v0.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
@@ -1161,7 +1361,7 @@ define <4 x i16> @sqshlu4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshlu.4h v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.4h, v0.4h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
@@ -1172,7 +1372,7 @@ define <2 x i32> @sqshlu2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshlu.2s v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.2s, v0.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
@@ -1183,7 +1383,7 @@ define <16 x i8> @sqshlu16b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu16b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshlu.16b v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.16b, v0.16b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
@@ -1194,7 +1394,7 @@ define <8 x i16> @sqshlu8h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu8h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshlu.8h v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -1205,7 +1405,7 @@ define <4 x i32> @sqshlu4s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu4s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshlu.4s v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.4s, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -1216,7 +1416,7 @@ define <2 x i64> @sqshlu2d(ptr %A) nounwind {
 ; CHECK-LABEL: sqshlu2d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshlu.2d v0, v0, #1
+; CHECK-NEXT:    sqshlu v0.2d, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
@@ -1275,7 +1475,7 @@ define <8 x i8> @rshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    rshrn.8b v0, v0, #1
+; CHECK-NEXT:    rshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1286,7 +1486,7 @@ define <4 x i16> @rshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    rshrn.4h v0, v0, #1
+; CHECK-NEXT:    rshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1297,7 +1497,7 @@ define <2 x i32> @rshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: rshrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    rshrn.2s v0, v0, #1
+; CHECK-NEXT:    rshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1309,7 +1509,7 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    rshrn2.16b v0, v1, #1
+; CHECK-NEXT:    rshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1323,7 +1523,7 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    rshrn2.8h v0, v1, #1
+; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1337,7 +1537,7 @@ define <4 x i32> @rshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    rshrn2.4s v0, v1, #1
+; CHECK-NEXT:    rshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1354,7 +1554,7 @@ define <8 x i8> @shrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: shrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.8b v0, v0, #1
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1366,7 +1566,7 @@ define <4 x i16> @shrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: shrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #1
+; CHECK-NEXT:    shrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
@@ -1378,7 +1578,7 @@ define <2 x i32> @shrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: shrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.2s v0, v0, #1
+; CHECK-NEXT:    shrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
@@ -1391,7 +1591,7 @@ define <16 x i8> @shrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shrn2.16b v0, v1, #1
+; CHECK-NEXT:    shrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1406,7 +1606,7 @@ define <8 x i16> @shrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shrn2.8h v0, v1, #1
+; CHECK-NEXT:    shrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1421,7 +1621,7 @@ define <4 x i32> @shrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    shrn2.4s v0, v1, #1
+; CHECK-NEXT:    shrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1450,7 +1650,7 @@ define <8 x i8> @sqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrn.8b v0, v0, #1
+; CHECK-NEXT:    sqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1461,7 +1661,7 @@ define <4 x i16> @sqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrn.4h v0, v0, #1
+; CHECK-NEXT:    sqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1472,7 +1672,7 @@ define <2 x i32> @sqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrn.2s v0, v0, #1
+; CHECK-NEXT:    sqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1485,7 +1685,7 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrn2.16b v0, v1, #1
+; CHECK-NEXT:    sqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1499,7 +1699,7 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrn2.8h v0, v1, #1
+; CHECK-NEXT:    sqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1513,7 +1713,7 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrn2.4s v0, v1, #1
+; CHECK-NEXT:    sqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1542,7 +1742,7 @@ define <8 x i8> @sqshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrun.8b v0, v0, #1
+; CHECK-NEXT:    sqshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1553,7 +1753,7 @@ define <4 x i16> @sqshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrun.4h v0, v0, #1
+; CHECK-NEXT:    sqshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1564,7 +1764,7 @@ define <2 x i32> @sqshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqshrun2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshrun.2s v0, v0, #1
+; CHECK-NEXT:    sqshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1576,7 +1776,7 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrun2.16b v0, v1, #1
+; CHECK-NEXT:    sqshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1590,7 +1790,7 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrun2.8h v0, v1, #1
+; CHECK-NEXT:    sqshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1604,7 +1804,7 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqshrun2.4s v0, v1, #1
+; CHECK-NEXT:    sqshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1633,7 +1833,7 @@ define <8 x i8> @sqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrn.8b v0, v0, #1
+; CHECK-NEXT:    sqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1644,7 +1844,7 @@ define <4 x i16> @sqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrn.4h v0, v0, #1
+; CHECK-NEXT:    sqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1655,7 +1855,7 @@ define <2 x i32> @sqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrn.2s v0, v0, #1
+; CHECK-NEXT:    sqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1667,7 +1867,7 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrn2.16b v0, v1, #1
+; CHECK-NEXT:    sqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1681,7 +1881,7 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrn2.8h v0, v1, #1
+; CHECK-NEXT:    sqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1695,7 +1895,7 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrn2.4s v0, v1, #1
+; CHECK-NEXT:    sqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1724,7 +1924,7 @@ define <8 x i8> @sqrshrun8b(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrun.8b v0, v0, #1
+; CHECK-NEXT:    sqrshrun v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1735,7 +1935,7 @@ define <4 x i16> @sqrshrun4h(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrun.4h v0, v0, #1
+; CHECK-NEXT:    sqrshrun v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1746,7 +1946,7 @@ define <2 x i32> @sqrshrun2s(ptr %A) nounwind {
 ; CHECK-LABEL: sqrshrun2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqrshrun.2s v0, v0, #1
+; CHECK-NEXT:    sqrshrun v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1758,7 +1958,7 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrun2.16b v0, v1, #1
+; CHECK-NEXT:    sqrshrun2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1772,7 +1972,7 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrun2.8h v0, v1, #1
+; CHECK-NEXT:    sqrshrun2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1786,7 +1986,7 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sqrshrun2.4s v0, v1, #1
+; CHECK-NEXT:    sqrshrun2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1815,7 +2015,7 @@ define <8 x i8> @uqrshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqrshrn.8b v0, v0, #1
+; CHECK-NEXT:    uqrshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1826,7 +2026,7 @@ define <4 x i16> @uqrshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqrshrn.4h v0, v0, #1
+; CHECK-NEXT:    uqrshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1837,7 +2037,7 @@ define <2 x i32> @uqrshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqrshrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqrshrn.2s v0, v0, #1
+; CHECK-NEXT:    uqrshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1849,7 +2049,7 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshrn2.16b v0, v1, #1
+; CHECK-NEXT:    uqrshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1863,7 +2063,7 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshrn2.8h v0, v1, #1
+; CHECK-NEXT:    uqrshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1877,7 +2077,7 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqrshrn2.4s v0, v1, #1
+; CHECK-NEXT:    uqrshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1906,7 +2106,7 @@ define <8 x i8> @uqshrn8b(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn8b:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshrn.8b v0, v0, #1
+; CHECK-NEXT:    uqshrn v0.8b, v0.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
@@ -1917,7 +2117,7 @@ define <4 x i16> @uqshrn4h(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn4h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshrn.4h v0, v0, #1
+; CHECK-NEXT:    uqshrn v0.4h, v0.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
@@ -1928,7 +2128,7 @@ define <2 x i32> @uqshrn2s(ptr %A) nounwind {
 ; CHECK-LABEL: uqshrn2s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshrn.2s v0, v0, #1
+; CHECK-NEXT:    uqshrn v0.2s, v0.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
@@ -1940,7 +2140,7 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshrn2.16b v0, v1, #1
+; CHECK-NEXT:    uqshrn2 v0.16b, v1.8h, #1
 ; CHECK-NEXT:    ret
   %out = load <8 x i8>, ptr %ret
   %tmp1 = load <8 x i16>, ptr %A
@@ -1954,7 +2154,7 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshrn2.8h v0, v1, #1
+; CHECK-NEXT:    uqshrn2 v0.8h, v1.4s, #1
 ; CHECK-NEXT:    ret
   %out = load <4 x i16>, ptr %ret
   %tmp1 = load <4 x i32>, ptr %A
@@ -1968,7 +2168,7 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    uqshrn2.4s v0, v1, #1
+; CHECK-NEXT:    uqshrn2 v0.4s, v1.2d, #1
 ; CHECK-NEXT:    ret
   %out = load <2 x i32>, ptr %ret
   %tmp1 = load <2 x i64>, ptr %A
@@ -1986,7 +2186,7 @@ define <8 x i16> @ushll8h(ptr %A) nounwind {
 ; CHECK-LABEL: ushll8h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.8h v0, v0, #1
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1998,7 +2198,7 @@ define <4 x i32> @ushll4s(ptr %A) nounwind {
 ; CHECK-LABEL: ushll4s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.4s v0, v0, #1
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2010,7 +2210,7 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 ; CHECK-LABEL: ushll2d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.2d v0, v0, #1
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2019,11 +2219,18 @@ define <2 x i64> @ushll2d(ptr %A) nounwind {
 }
 
 define <8 x i16> @ushll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    ushll.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ushll2_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ushll2_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <16 x i8>, ptr %A
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2032,11 +2239,18 @@ define <8 x i16> @ushll2_8h(ptr %A) nounwind {
 }
 
 define <4 x i32> @ushll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    ushll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ushll2_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ushll2_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <8 x i16>, ptr %A
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -2045,11 +2259,18 @@ define <4 x i32> @ushll2_4s(ptr %A) nounwind {
 }
 
 define <2 x i64> @ushll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: ushll2_2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    ushll.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ushll2_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ushll2_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <4 x i32>, ptr %A
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -2064,24 +2285,32 @@ declare <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.ushl.i64(i64, i64)
 
-define <8 x i16> @neon.ushll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll8h_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.8h v0, v0, #1
-; CHECK-NEXT:    ret
+define <8 x i16> @neon_ushll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll8h_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushll8h_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.8h, #1
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %tmp3
 }
 
-define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_no_constant_shift:
+define <8 x i16> @neon_ushl8h_no_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl8h_no_constant_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushl.8h v0, v0, v0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushl v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -2089,36 +2318,76 @@ define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind {
   ret <8 x i16> %tmp3
 }
 
-define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8h_constant_shift_extend_not_2x:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushl8h_constant_shift_extend_not_2x:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    uxtb w8, w8
+; CHECK-GI-NEXT:    mov b2, v1.b[2]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v1.b[3]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w10, s3
+; CHECK-GI-NEXT:    fmov w11, s4
+; CHECK-GI-NEXT:    uxtb w9, w9
+; CHECK-GI-NEXT:    uxtb w10, w10
+; CHECK-GI-NEXT:    uxtb w11, w11
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    mov v2.h[1], w11
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i8>, ptr %A
   %tmp2 = zext <4 x i8> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl8_noext_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    add.8h v0, v0, v0
-; CHECK-NEXT:    ret
+define <8 x i16> @neon_ushl8_noext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushl8_noext_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.8h, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ushl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %tmp3
 }
 
-define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_ushll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -2126,13 +2395,21 @@ define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind {
 }
 
 ; FIXME: unnecessary ushll.4s v0, v0, #0?
-define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll4s_neg_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushr.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_ushll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_neg_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -2140,35 +2417,52 @@ define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind {
 }
 
 ; FIXME: should be constant folded.
-define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.ushll4s_constant_fold:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI160_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI160_0]
-; CHECK-NEXT:    add.4s v0, v0, v0
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_ushll4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_ushll4s_constant_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI160_0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI160_0]
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushll4s_constant_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    adrp x8, .LCPI160_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI160_0]
+; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <2 x i64> @neon.ushll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushll2d_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ushll.2d v0, v0, #1
-; CHECK-NEXT:    ret
+define <2 x i64> @neon_ushll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_ushll2d_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_ushll2d_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI161_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI161_0]
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
 
-define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_vscalar_constant_shift:
+define <1 x i64> @neon_ushl_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_vscalar_constant_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    zip1.2s v0, v0, v1
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    shl d0, d0, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <1 x i32>, ptr %A
@@ -2177,8 +2471,8 @@ define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind {
   ret <1 x i64> %tmp3
 }
 
-define i64 @neon.ushl_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.ushl_scalar_constant_shift:
+define i64 @neon_ushl_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_ushl_scalar_constant_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    fmov d0, x8
@@ -2195,7 +2489,7 @@ define <8 x i16> @sshll8h(ptr %A) nounwind {
 ; CHECK-LABEL: sshll8h:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.8h v0, v0, #1
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2207,7 +2501,7 @@ define <2 x i64> @sshll2d(ptr %A) nounwind {
 ; CHECK-LABEL: sshll2d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.2d v0, v0, #1
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2222,85 +2516,156 @@ declare <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64>, <2 x i64>)
 declare <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64>, <1 x i64>)
 declare i64 @llvm.aarch64.neon.sshl.i64(i64, i64)
 
-define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    add.16b v0, v0, v0
-; CHECK-NEXT:    ret
+define <16 x i8> @neon_sshl16b_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.16b, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <16 x i8> %tmp2
 }
 
-define <16 x i8> @neon.sshl16b_non_splat_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_non_splat_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI167_0
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI167_0]
-; CHECK-NEXT:    sshl.16b v0, v0, v1
-; CHECK-NEXT:    ret
+define <16 x i8> @neon_sshl16b_non_splat_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI167_0
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI167_0]
+; CHECK-SD-NEXT:    sshl v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_non_splat_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI167_0
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI167_0]
+; CHECK-GI-NEXT:    sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 6, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <16 x i8> %tmp2
 }
 
-define <16 x i8> @neon.sshl16b_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl16b_neg_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sshr.16b v0, v0, #2
-; CHECK-NEXT:    ret
+define <16 x i8> @neon_sshl16b_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    sshr v0.16b, v0.16b, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl16b_neg_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.16b, #254
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2>)
   ret <16 x i8> %tmp2
 }
 
-define <8 x i16> @neon.sshll8h_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll8h_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.8h v0, v0, #1
-; CHECK-NEXT:    ret
+define <8 x i16> @neon_sshll8h_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll8h_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshll8h_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.8h, #1
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %tmp2, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %tmp3
 }
 
-define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_wrong_ext_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    sshll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_sshl4s_wrong_ext_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_wrong_ext_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    sxtb w8, w8
+; CHECK-GI-NEXT:    mov b2, v1.b[2]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v1.b[3]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w10, s3
+; CHECK-GI-NEXT:    fmov w11, s4
+; CHECK-GI-NEXT:    sxtb w9, w9
+; CHECK-GI-NEXT:    sxtb w10, w10
+; CHECK-GI-NEXT:    sxtb w11, w11
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    mov v2.h[1], w11
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT:    sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i8>, ptr %A
   %tmp2 = sext <4 x i8> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <4 x i32> @neon.sshll4s_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_sshll4s_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll4s_neg_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshr.4s v0, v0, #1
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_sshll4s_neg_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshll4s_neg_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
@@ -2308,46 +2673,70 @@ define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind {
 }
 
 ; FIXME: should be constant folded.
-define <4 x i32> @neon.sshl4s_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl4s_constant_fold:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI173_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI173_0]
-; CHECK-NEXT:    shl.4s v0, v0, #2
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_sshl4s_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_constant_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI173_0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI173_0]
+; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_constant_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4s, #2
+; CHECK-GI-NEXT:    adrp x8, .LCPI173_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI173_0]
+; CHECK-GI-NEXT:    sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
   ret <4 x i32> %tmp3
 }
 
-define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl4s_no_fold:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    add.4s v0, v0, v0
-; CHECK-NEXT:    ret
+define <4 x i32> @neon_sshl4s_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl4s_no_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl4s_no_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <2 x i64> @neon.sshll2d_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll2d_constant_shift:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sshll.2d v0, v0, #1
-; CHECK-NEXT:    ret
+define <2 x i64> @neon_sshll2d_constant_shift(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshll2d_constant_shift:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshll2d_constant_shift:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI175_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI175_0]
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
 
-define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_vscalar_constant_shift:
+define <1 x i64> @neon_sshll_vscalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_vscalar_constant_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    zip1.2s v0, v0, v1
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    shl d0, d0, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <1 x i32>, ptr %A
@@ -2356,8 +2745,8 @@ define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind {
   ret <1 x i64> %tmp3
 }
 
-define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift:
+define i64 @neon_sshll_scalar_constant_shift(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    fmov d0, x8
@@ -2370,8 +2759,8 @@ define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind {
   ret i64 %tmp3
 }
 
-define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshll_scalar_constant_shift_m1:
+define i64 @neon_sshll_scalar_constant_shift_m1(ptr %A) nounwind {
+; CHECK-LABEL: neon_sshll_scalar_constant_shift_m1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    fmov d0, x8
@@ -2385,34 +2774,58 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
 }
 
 ; FIXME: should be constant folded.
-define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
-; CHECK-LABEL: neon.sshl2d_constant_fold:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI179_0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI179_0]
-; CHECK-NEXT:    add.2d v0, v0, v0
-; CHECK-NEXT:    ret
+define <2 x i64> @neon_sshl2d_constant_fold() nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_constant_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI179_0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI179_0]
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_constant_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI179_1
+; CHECK-GI-NEXT:    adrp x9, .LCPI179_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI179_1]
+; CHECK-GI-NEXT:    ldr q1, [x9, :lo12:.LCPI179_0]
+; CHECK-GI-NEXT:    sshl v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> <i64 99, i64 1000>, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
 
-define <2 x i64> @neon.sshl2d_no_fold(ptr %A) nounwind {
-; CHECK-LABEL: neon.sshl2d_no_fold:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shl.2d v0, v0, #2
-; CHECK-NEXT:    ret
+define <2 x i64> @neon_sshl2d_no_fold(ptr %A) nounwind {
+; CHECK-SD-LABEL: neon_sshl2d_no_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: neon_sshl2d_no_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI180_0
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI180_0]
+; CHECK-GI-NEXT:    sshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ret
   %tmp2 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 2, i64 2>)
   ret <2 x i64> %tmp3
 }
 
 define <8 x i16> @sshll2_8h(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    sshll.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sshll2_8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sshll2_8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <16 x i8>, ptr %A
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -2421,11 +2834,18 @@ define <8 x i16> @sshll2_8h(ptr %A) nounwind {
 }
 
 define <4 x i32> @sshll2_4s(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    sshll.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sshll2_4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sshll2_4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <8 x i16>, ptr %A
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -2434,11 +2854,18 @@ define <4 x i32> @sshll2_4s(ptr %A) nounwind {
 }
 
 define <2 x i64> @sshll2_2d(ptr %A) nounwind {
-; CHECK-LABEL: sshll2_2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #8]
-; CHECK-NEXT:    sshll.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sshll2_2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0, #8]
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sshll2_2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT:    ret
   %load1 = load <4 x i32>, ptr %A
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
@@ -2447,88 +2874,145 @@ define <2 x i64> @sshll2_2d(ptr %A) nounwind {
 }
 
 define <8 x i8> @sqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshl.8b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.8b, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @sqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshl.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4h, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
   ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @sqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    sqshl.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2s, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
   ret <2 x i32> %tmp3
 }
 
 define <16 x i8> @sqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: sqshli16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshl.16b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.16b, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i16> @sqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: sqshli8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshl.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.8h, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @sqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: sqshli4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshl.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    sqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @sqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: sqshli2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    sqshl.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshli2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    sqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshli2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI190_0
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI190_0]
+; CHECK-GI-NEXT:    sqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
 
 define <8 x i8> @uqshli8b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uqshl.8b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.8b, v0.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.8b, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <8 x i8> %tmp3
@@ -2537,9 +3021,9 @@ define <8 x i8> @uqshli8b(ptr %A) nounwind {
 define <8 x i8> @uqshli8b_1(ptr %A) nounwind {
 ; CHECK-LABEL: uqshli8b_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.8b v1, #8
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uqshl.8b v0, v0, v1
+; CHECK-NEXT:    movi v0.8b, #8
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    uqshl v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
@@ -2547,78 +3031,130 @@ define <8 x i8> @uqshli8b_1(ptr %A) nounwind {
 }
 
 define <4 x i16> @uqshli4h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uqshl.4h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.4h, v0.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4h, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
   ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @uqshli2s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    uqshl.2s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.2s, v0.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2s, #1
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
   ret <2 x i32> %tmp3
 }
 
 define <16 x i8> @uqshli16b(ptr %A) nounwind {
-; CHECK-LABEL: uqshli16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshl.16b v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.16b, v0.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.16b, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i16> @uqshli8h(ptr %A) nounwind {
-; CHECK-LABEL: uqshli8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshl.8h v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.8h, v0.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.8h, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %tmp3
 }
 
 define <4 x i32> @uqshli4s(ptr %A) nounwind {
-; CHECK-LABEL: uqshli4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshl.4s v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.4s, v0.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.4s, #1
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    uqshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
 define <2 x i64> @uqshli2d(ptr %A) nounwind {
-; CHECK-LABEL: uqshli2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    uqshl.2d v0, v0, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshli2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    uqshl v0.2d, v0.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshli2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI198_0
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI198_0]
+; CHECK-GI-NEXT:    uqshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
 
 define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ursra.8b v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    ursra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %tmp4 = load <8 x i8>, ptr %B
@@ -2627,12 +3163,21 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ursra.4h v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    ursra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   %tmp4 = load <4 x i16>, ptr %B
@@ -2641,12 +3186,21 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ursra.2s v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    ursra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    urshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
   %tmp4 = load <2 x i32>, ptr %B
@@ -2655,12 +3209,21 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ursra.16b v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    ursra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %tmp4 = load <16 x i8>, ptr %B
@@ -2669,12 +3232,21 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ursra.8h v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    ursra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
   %tmp4 = load <8 x i16>, ptr %B
@@ -2683,12 +3255,21 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ursra.4s v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    ursra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
   %tmp4 = load <4 x i32>, ptr %B
@@ -2697,12 +3278,21 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: ursra2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ursra.2d v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ursra2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    ursra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ursra2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    urshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
   %tmp4 = load <2 x i64>, ptr %B
@@ -2740,12 +3330,21 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    srsra.8b v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    srsra v0.8b, v1.8b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %tmp4 = load <8 x i8>, ptr %B
@@ -2754,12 +3353,21 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    srsra.4h v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    srsra v0.4h, v1.4h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
   %tmp4 = load <4 x i16>, ptr %B
@@ -2768,12 +3376,21 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    srsra.2s v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    srsra v0.2s, v1.2s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi d0, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    srshl v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
   %tmp4 = load <2 x i32>, ptr %B
@@ -2782,12 +3399,21 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    srsra.16b v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    srsra v0.16b, v1.16b, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %tmp4 = load <16 x i8>, ptr %B
@@ -2796,12 +3422,21 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    srsra.8h v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    srsra v0.8h, v1.8h, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
   %tmp4 = load <8 x i16>, ptr %B
@@ -2810,12 +3445,21 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    srsra.4s v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    srsra v0.4s, v1.4s, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
   %tmp4 = load <4 x i32>, ptr %B
@@ -2824,12 +3468,21 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: srsra2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    srsra.2d v0, v1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srsra2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q0, [x1]
+; CHECK-SD-NEXT:    srsra v0.2d, v1.2d, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srsra2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    srshl v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
   %tmp4 = load <2 x i64>, ptr %B
@@ -2871,7 +3524,7 @@ define <8 x i8> @usra8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    usra.8b v0, v1, #1
+; CHECK-NEXT:    usra v0.8b, v1.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -2885,7 +3538,7 @@ define <4 x i16> @usra4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    usra.4h v0, v1, #1
+; CHECK-NEXT:    usra v0.4h, v1.4h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
@@ -2899,7 +3552,7 @@ define <2 x i32> @usra2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    usra.2s v0, v1, #1
+; CHECK-NEXT:    usra v0.2s, v1.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
@@ -2913,7 +3566,7 @@ define <16 x i8> @usra16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    usra.16b v0, v1, #1
+; CHECK-NEXT:    usra v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -2927,7 +3580,7 @@ define <8 x i16> @usra8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    usra.8h v0, v1, #1
+; CHECK-NEXT:    usra v0.8h, v1.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -2941,7 +3594,7 @@ define <4 x i32> @usra4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    usra.4s v0, v1, #1
+; CHECK-NEXT:    usra v0.4s, v1.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
@@ -2955,7 +3608,7 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    usra.2d v0, v1, #1
+; CHECK-NEXT:    usra v0.2d, v1.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
@@ -2965,12 +3618,20 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind {
 }
 
 define <1 x i64> @usra1d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: usra1d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    usra d0, d1, #1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: usra1d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d0, [x1]
+; CHECK-SD-NEXT:    usra d0, d1, #1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usra1d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr x8, [x0]
+; CHECK-GI-NEXT:    ldr x9, [x1]
+; CHECK-GI-NEXT:    add x8, x9, x8, lsr #1
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <1 x i64>, ptr %A
   %tmp3 = lshr <1 x i64> %tmp1, <i64 1>
   %tmp4 = load <1 x i64>, ptr %B
@@ -2983,7 +3644,7 @@ define <8 x i8> @ssra8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ssra.8b v0, v1, #1
+; CHECK-NEXT:    ssra v0.8b, v1.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp3 = ashr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -2997,7 +3658,7 @@ define <4 x i16> @ssra4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ssra.4h v0, v1, #1
+; CHECK-NEXT:    ssra v0.4h, v1.4h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp3 = ashr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
@@ -3011,7 +3672,7 @@ define <2 x i32> @ssra2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ssra.2s v0, v1, #1
+; CHECK-NEXT:    ssra v0.2s, v1.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp3 = ashr <2 x i32> %tmp1, <i32 1, i32 1>
@@ -3025,7 +3686,7 @@ define <16 x i8> @ssra16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ssra.16b v0, v1, #1
+; CHECK-NEXT:    ssra v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp3 = ashr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -3039,7 +3700,7 @@ define <8 x i16> @ssra8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ssra.8h v0, v1, #1
+; CHECK-NEXT:    ssra v0.8h, v1.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp3 = ashr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -3053,7 +3714,7 @@ define <4 x i32> @ssra4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ssra.4s v0, v1, #1
+; CHECK-NEXT:    ssra v0.4s, v1.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp3 = ashr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
@@ -3067,7 +3728,7 @@ define <2 x i64> @ssra2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    ssra.2d v0, v1, #1
+; CHECK-NEXT:    ssra v0.2d, v1.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp3 = ashr <2 x i64> %tmp1, <i64 1, i64 1>
@@ -3081,8 +3742,8 @@ define <8 x i8> @shr_orr8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ushr.8b v0, v0, #1
-; CHECK-NEXT:    orr.8b v0, v0, v1
+; CHECK-NEXT:    ushr v0.8b, v0.8b, #1
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp4 = load <8 x i8>, ptr %B
@@ -3096,8 +3757,8 @@ define <4 x i16> @shr_orr4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ushr.4h v0, v0, #1
-; CHECK-NEXT:    orr.8b v0, v0, v1
+; CHECK-NEXT:    ushr v0.4h, v0.4h, #1
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp4 = load <4 x i16>, ptr %B
@@ -3111,8 +3772,8 @@ define <2 x i32> @shr_orr2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ushr.2s v0, v0, #1
-; CHECK-NEXT:    orr.8b v0, v0, v1
+; CHECK-NEXT:    ushr v0.2s, v0.2s, #1
+; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp4 = load <2 x i32>, ptr %B
@@ -3126,8 +3787,8 @@ define <16 x i8> @shr_orr16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ushr.16b v0, v0, #1
-; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:    ushr v0.16b, v0.16b, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp4 = load <16 x i8>, ptr %B
@@ -3141,8 +3802,8 @@ define <8 x i16> @shr_orr8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ushr.8h v0, v0, #1
-; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp4 = load <8 x i16>, ptr %B
@@ -3156,8 +3817,8 @@ define <4 x i32> @shr_orr4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ushr.4s v0, v0, #1
-; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp4 = load <4 x i32>, ptr %B
@@ -3171,8 +3832,8 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ushr.2d v0, v0, #1
-; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:    ushr v0.2d, v0.2d, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp4 = load <2 x i64>, ptr %B
@@ -3182,13 +3843,21 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    add.8b v0, v0, v0
-; CHECK-NEXT:    orr.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1]
+; CHECK-SD-NEXT:    add v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    shl v0.8b, v0.8b, #1
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp4 = load <8 x i8>, ptr %B
   %tmp3 = shl <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -3197,13 +3866,21 @@ define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    add.4h v0, v0, v0
-; CHECK-NEXT:    orr.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr4h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1]
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr4h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #1
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp4 = load <4 x i16>, ptr %B
   %tmp3 = shl <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
@@ -3212,13 +3889,21 @@ define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    add.2s v0, v0, v0
-; CHECK-NEXT:    orr.8b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr2s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ldr d1, [x1]
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr2s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #1
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp4 = load <2 x i32>, ptr %B
   %tmp3 = shl <2 x i32> %tmp1, <i32 1, i32 1>
@@ -3227,13 +3912,21 @@ define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind {
 }
 
 define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    add.16b v0, v0, v0
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    shl v0.16b, v0.16b, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp4 = load <16 x i8>, ptr %B
   %tmp3 = shl <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -3242,13 +3935,21 @@ define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr8h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    add.8h v0, v0, v0
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr8h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr8h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp4 = load <8 x i16>, ptr %B
   %tmp3 = shl <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -3257,13 +3958,21 @@ define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr4s:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    add.4s v0, v0, v0
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr4s:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr4s:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp4 = load <4 x i32>, ptr %B
   %tmp3 = shl <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
@@ -3272,13 +3981,21 @@ define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: shl_orr2d:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    add.2d v0, v0, v0
-; CHECK-NEXT:    orr.16b v0, v0, v1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shl_orr2d:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ldr q1, [x1]
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v0.2d
+; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_orr2d:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp4 = load <2 x i64>, ptr %B
   %tmp3 = shl <2 x i64> %tmp1, <i64 1, i64 1>
@@ -3287,20 +4004,32 @@ define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @shll(<8 x i8> %in) {
-; CHECK-LABEL: shll:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shll.8h v0, v0, #8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shll:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shll:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #8
+; CHECK-GI-NEXT:    ret
   %ext = zext <8 x i8> %in to <8 x i16>
   %res = shl <8 x i16> %ext, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   ret <8 x i16> %res
 }
 
 define <4 x i32> @shll_high(<8 x i16> %in) {
-; CHECK-LABEL: shll_high:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shll2.4s v0, v0, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shll_high:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shll_high:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #16
+; CHECK-GI-NEXT:    ret
   %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %ext = zext <4 x i16> %extract to <4 x i32>
   %res = shl <4 x i32> %ext, <i32 16, i32 16, i32 16, i32 16>
@@ -3312,7 +4041,7 @@ define <8 x i8> @sli8b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sli.8b v0, v1, #1
+; CHECK-NEXT:    sli v0.8b, v1.8b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
@@ -3325,7 +4054,7 @@ define <4 x i16> @sli4h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sli.4h v0, v1, #1
+; CHECK-NEXT:    sli v0.4h, v1.4h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -3338,7 +4067,7 @@ define <2 x i32> @sli2s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    sli.2s v0, v1, #1
+; CHECK-NEXT:    sli v0.2s, v1.2s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -3364,7 +4093,7 @@ define <16 x i8> @sli16b(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sli.16b v0, v1, #1
+; CHECK-NEXT:    sli v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
@@ -3377,7 +4106,7 @@ define <8 x i16> @sli8h(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sli.8h v0, v1, #1
+; CHECK-NEXT:    sli v0.8h, v1.8h, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
@@ -3390,7 +4119,7 @@ define <4 x i32> @sli4s(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sli.4s v0, v1, #1
+; CHECK-NEXT:    sli v0.4s, v1.4s, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
@@ -3403,7 +4132,7 @@ define <2 x i64> @sli2d(ptr %A, ptr %B) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sli.2d v0, v1, #1
+; CHECK-NEXT:    sli v0.2d, v1.2d, #1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i64>, ptr %B
@@ -3422,21 +4151,37 @@ declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounw
 declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
 
 define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: ashr_v1i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg d1, d1
-; CHECK-NEXT:    sshl d0, d0, d1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ashr_v1i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    neg d1, d1
+; CHECK-SD-NEXT:    sshl d0, d0, d1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ashr_v1i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    asr x8, x8, x9
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
   %c = ashr <1 x i64> %a, %b
   ret <1 x i64> %c
 }
 
 define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sqshl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sqshl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sqshl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    sqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3445,11 +4190,19 @@ entry:
 }
 
 define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: uqshl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uqshl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uqshl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    uqshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3458,11 +4211,19 @@ entry:
 }
 
 define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: srshl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: srshl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: srshl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    srshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3471,11 +4232,19 @@ entry:
 }
 
 define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: urshl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: urshl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: urshl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    urshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3486,8 +4255,8 @@ entry:
 define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
 ; CHECK-LABEL: sqshlu_zero_shift_amount:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    sqshlu.2d v0, v0, #0
+; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    sqshlu v0.2d, v0.2d, #0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -3498,11 +4267,19 @@ entry:
 }
 
 define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: sshl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sshl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sshl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    sshl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3511,11 +4288,19 @@ entry:
 }
 
 define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) {
-; CHECK-LABEL: ushl_zero_shift_amount:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addp.2d v0, v0, v1
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: ushl_zero_shift_amount:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ushl_zero_shift_amount:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    addp v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ushl v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vpaddq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
   %vshlq_v2.i.i = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %vpaddq_v2.i.i, <2 x i64> zeroinitializer)
@@ -3526,8 +4311,8 @@ entry:
 define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: sext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rshrn.4h v0, v0, #13
-; CHECK-NEXT:    sshll.4s v0, v0, #0
+; CHECK-NEXT:    rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3538,8 +4323,8 @@ entry:
 define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: zext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rshrn.4h v0, v0, #13
-; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    rshrn v0.4h, v0.4s, #13
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3550,9 +4335,9 @@ entry:
 define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: mul_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.4s v1, #3
-; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    rshrn.4h v0, v0, #13
+; CHECK-NEXT:    movi v1.4s, #3
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    rshrn v0.4h, v0.4s, #13
 ; CHECK-NEXT:    ret
 entry:
   %b = add <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
@@ -3561,15 +4346,61 @@ entry:
 }
 
 define <8 x i16> @signbits_vashr(<8 x i16> %a)  {
-; CHECK-LABEL: signbits_vashr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr.8h v0, v0, #8
-; CHECK-NEXT:    sshr.8h v0, v0, #9
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: signbits_vashr:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshr v0.8h, v0.8h, #8
+; CHECK-SD-NEXT:    sshr v0.8h, v0.8h, #9
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: signbits_vashr:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mvni v1.8h, #7
+; CHECK-GI-NEXT:    mvni v2.8h, #8
+; CHECK-GI-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshl v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    sshr v0.8h, v0.8h, #7
+; CHECK-GI-NEXT:    ret
   %b = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
   %c = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %b, <8 x i16> <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>)
   %d = ashr <8 x i16> %c, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   ret <8 x i16> %d
 }
 
+define <2 x i8> @lshr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: lshr_trunc_v2i64_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shrn v0.2s, v0.2d, #16
+; CHECK-NEXT:    ret
+  %b = lshr <2 x i64> %a, <i64 16, i64 16>
+  %c = trunc <2 x i64> %b to <2 x i8>
+  ret <2 x i8> %c
+}
+
+define <2 x i8> @ashr_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: ashr_trunc_v2i64_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shrn v0.2s, v0.2d, #16
+; CHECK-NEXT:    ret
+  %b = ashr <2 x i64> %a, <i64 16, i64 16>
+  %c = trunc <2 x i64> %b to <2 x i8>
+  ret <2 x i8> %c
+}
+
+define <2 x i8> @shl_trunc_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-SD-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shl_trunc_v2i64_v2i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #16
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+  %b = shl <2 x i64> %a, <i64 16, i64 16>
+  %c = trunc <2 x i64> %b to <2 x i8>
+  ret <2 x i8> %c
+}
+
 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)


        


More information about the llvm-commits mailing list