[llvm] af42d80 - [AArch64] Unit test for zext lowering for different types of vectors

Nilanjana Basu via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 9 00:51:11 PST 2022


Author: Nilanjana Basu
Date: 2022-12-09T13:55:25+05:30
New Revision: af42d80a63e38a941bc0d9df8e4d6ae287f75566

URL: https://github.com/llvm/llvm-project/commit/af42d80a63e38a941bc0d9df8e4d6ae287f75566
DIFF: https://github.com/llvm/llvm-project/commit/af42d80a63e38a941bc0d9df8e4d6ae287f75566.diff

LOG: [AArch64] Unit test for zext lowering for different types of vectors

These are unit tests for the code generated when lowering zext of vectors.
They cover different vector types, as well as cases where multiple 'zext' instructions of the same type occur back-to-back.
Testing of the GlobalISel path is also added.
This will help in comparing changes to zext lowering in https://reviews.llvm.org/D136722.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D137993

Added: 
    

Modified: 
    llvm/test/CodeGen/AArch64/zext-to-tbl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 232777936ae55..cfaa369d9d924 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
 
 ; CHECK-LABEL: lCPI0_0:
 ; CHECK-NEXT:    .byte   0                               ; 0x0
@@ -792,12 +794,1060 @@ exit:
   ret void
 }
 
+define void @zext_v8i8_to_v8i64_in_loop(i8* %src, i64* %dst) { ; Loop test: each iteration loads <8 x i8> from %src, zero-extends it to <8 x i64> and stores it to %dst.
+; CHECK-LABEL: _zext_v8i8_to_v8i64_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT:  	mov	x8, xzr
+; CHECK-NEXT:  LBB8_1:                                 ; %loop
+; CHECK-NEXT:                                          ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:  	ldr	d0, [x0, x8]
+; CHECK-NEXT:  	add	x8, x8, #16
+; CHECK-NEXT:  	cmp	x8, #128
+; CHECK-NEXT:  	ushll.8h	v0, v0, #0
+; CHECK-NEXT:  	ushll2.4s	v1, v0, #0
+; CHECK-NEXT:  	ushll.4s	v0, v0, #0
+; CHECK-NEXT:  	ushll2.2d	v2, v1, #0
+; CHECK-NEXT:  	ushll.2d	v1, v1, #0
+; CHECK-NEXT:  	ushll2.2d	v3, v0, #0
+; CHECK-NEXT:  	ushll.2d	v0, v0, #0
+; CHECK-NEXT:  	stp	q1, q2, [x1, #32]
+; CHECK-NEXT:  	stp	q0, q3, [x1], #128
+; CHECK-NEXT:  	b.ne	LBB8_1
+; CHECK-NEXT:  ; %bb.2:                                ; %exit
+; CHECK-NEXT:  	ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i64_in_loop:
+; CHECK-BE: // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB8_1:                                // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x10, x1, #32
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #48
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	ushll2	v2.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #16
+; CHECK-BE-NEXT:  	ushll	v2.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x10]
+; CHECK-BE-NEXT:  	ushll2	v0.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x1]
+; CHECK-BE-NEXT:  	add	x1, x1, #128
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x9]
+; CHECK-BE-NEXT:  	b.ne	.LBB8_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* ; view the element pointer as a vector pointer
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i64> ; the widening whose lowering is being tested
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>*
+  store <8 x i64> %ext, <8 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 8 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v8i8_to_v8i16_in_loop(i8* %src, i16* %dst) { ; Loop test: each iteration loads <8 x i8> from %src, zero-extends it to <8 x i16> and stores it to %dst.
+; CHECK-LABEL:  _zext_v8i8_to_v8i16_in_loop:            ; @zext_v8i8_to_v8i16_in_loop
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB9_1:                                 ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	d0, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	str	q0, [x1], #32
+; CHECK-NEXT: 	b.ne	LBB9_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL:  zext_v8i8_to_v8i16_in_loop:             // @zext_v8i8_to_v8i16_in_loop
+; CHECK-BE:  	.cfi_startproc
+; CHECK-BE-NEXT:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB9_1:                                // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	st1	{ v0.8h }, [x1]
+; CHECK-BE-NEXT:  	add	x1, x1, #32
+; CHECK-BE-NEXT:  	b.ne	.LBB9_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* ; view the element pointer as a vector pointer
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i16> ; the widening whose lowering is being tested (one doubling of the element width)
+  %dst.gep = getelementptr i16, i16* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i16* %dst.gep to <8 x i16>*
+  store <8 x i16> %ext, <8 x i16>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 8 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v8i8_to_v8i20_in_loop(i8* %src, i20* %dst) { ; Loop test: each iteration loads <8 x i8> from %src, zero-extends it to the non-power-of-two type <8 x i20> and stores it to %dst.
+; CHECK-LABEL:  _zext_v8i8_to_v8i20_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB10_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	d0, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll2.4s	v1, v0, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	mov.s	w10, v1[1]
+; CHECK-NEXT: 	mov.s	w13, v0[1]
+; CHECK-NEXT: 	fmov	w11, s1
+; CHECK-NEXT: 	mov.s	w12, v1[2]
+; CHECK-NEXT: 	fmov	w15, s0
+; CHECK-NEXT: 	mov.s	w16, v0[2]
+; CHECK-NEXT: 	mov.s	w9, v1[3]
+; CHECK-NEXT: 	mov.s	w14, v0[3]
+; CHECK-NEXT: 	orr	x10, x11, x10, lsl #20
+; CHECK-NEXT: 	orr	x11, x15, x13, lsl #20
+; CHECK-NEXT: 	orr	x10, x10, x12, lsl #40
+; CHECK-NEXT: 	orr	x11, x11, x16, lsl #40
+; CHECK-NEXT: 	lsr	x13, x9, #4
+; CHECK-NEXT: 	lsr	x12, x14, #4
+; CHECK-NEXT: 	orr	x9, x10, x9, lsl #60
+; CHECK-NEXT: 	orr	x10, x11, x14, lsl #60
+; CHECK-NEXT: 	strh	w13, [x1, #18]
+; CHECK-NEXT: 	strh	w12, [x1, #8]
+; CHECK-NEXT: 	stur	x9, [x1, #10]
+; CHECK-NEXT: 	str	x10, [x1], #64
+; CHECK-NEXT: 	b.ne	LBB10_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i20_in_loop:
+; CHECK-BE: // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB10_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	mov	w9, v1.s[1]
+; CHECK-BE-NEXT:  	mov	w11, v0.s[1]
+; CHECK-BE-NEXT:  	mov	w13, v1.s[2]
+; CHECK-BE-NEXT:  	fmov	w14, s1
+; CHECK-BE-NEXT:  	mov	w15, v0.s[2]
+; CHECK-BE-NEXT:  	fmov	w16, s0
+; CHECK-BE-NEXT:  	mov	w10, v1.s[3]
+; CHECK-BE-NEXT:  	lsl	x9, x9, #40
+; CHECK-BE-NEXT:  	mov	w12, v0.s[3]
+; CHECK-BE-NEXT:  	lsl	x11, x11, #40
+; CHECK-BE-NEXT:  	orr	x9, x9, x14, lsl #60
+; CHECK-BE-NEXT:  	orr	x11, x11, x16, lsl #60
+; CHECK-BE-NEXT:  	orr	x9, x9, x13, lsl #20
+; CHECK-BE-NEXT:  	orr	x11, x11, x15, lsl #20
+; CHECK-BE-NEXT:  	lsr	w13, w14, #4
+; CHECK-BE-NEXT:  	lsr	w14, w16, #4
+; CHECK-BE-NEXT:  	strh	w10, [x1, #18]
+; CHECK-BE-NEXT:  	extr	x9, x13, x9, #16
+; CHECK-BE-NEXT:  	strh	w12, [x1, #8]
+; CHECK-BE-NEXT:  	extr	x10, x14, x11, #16
+; CHECK-BE-NEXT:  	stur	x9, [x1, #10]
+; CHECK-BE-NEXT:  	str	x10, [x1], #64
+
+entry: ; NOTE(review): the big-endian assertions above end at the final store and omit the loop branch and return that the little-endian block matches
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* ; view the element pointer as a vector pointer
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i20> ; the widening whose lowering is being tested; i20 is not a power-of-two width
+  %dst.gep = getelementptr i20, i20* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i20* %dst.gep to <8 x i20>*
+  store <8 x i20> %ext, <8 x i20>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 8 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v4i8_to_v4i32_in_loop(i8* %src, i32* %dst) { ; Loop test: each iteration loads <4 x i8> from %src, zero-extends it to <4 x i32> and stores it to %dst.
+; CHECK-LABEL:  _zext_v4i8_to_v4i32_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB11_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	s0, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	str	q0, [x1], #64
+; CHECK-NEXT: 	b.ne	LBB11_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
+; CHECK-BE: // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB11_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	ldr	s0, [x0, x8]
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	rev32	v0.8b, v0.8b
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	st1	{ v0.4s }, [x1]
+; CHECK-BE-NEXT:  	add	x1, x1, #64
+; CHECK-BE-NEXT:  	b.ne	.LBB11_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <4 x i8>* ; view the element pointer as a vector pointer
+  %load = load <4 x i8>, <4 x i8>* %src.gep.cast
+  %ext = zext <4 x i8> %load to <4 x i32> ; the widening whose lowering is being tested
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i32* %dst.gep to <4 x i32>*
+  store <4 x i32> %ext, <4 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 4 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v12i8_to_v12i32_in_loop(i8* %src, i32* %dst) { ; Loop test: each iteration loads <12 x i8> from %src, zero-extends it to <12 x i32> and stores it to %dst (element count is not a power of two).
+; CHECK-LABEL:  _zext_v12i8_to_v12i32_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB12_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	q0, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ushll2.8h	v1, v0, #0
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll.4s	v1, v1, #0
+; CHECK-NEXT: 	ushll2.4s	v2, v0, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	stp	q2, q1, [x1, #16]
+; CHECK-NEXT: 	str	q0, [x1], #64
+; CHECK-NEXT: 	b.ne	LBB12_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
+; CHECK-BE: // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB12_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x10, x1, #16
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.16b }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #32
+; CHECK-BE-NEXT:  	ushll	v1.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll2	v0.8h, v0.16b, #0
+; CHECK-BE-NEXT:  	ushll	v2.4s, v1.4h, #0
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v1.8h, #0
+; CHECK-BE-NEXT:  	st1	{ v2.4s }, [x1]
+; CHECK-BE-NEXT:  	add	x1, x1, #64
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	st1	{ v1.4s }, [x10]
+; CHECK-BE-NEXT:  	st1	{ v0.4s }, [x9]
+; CHECK-BE-NEXT:  	b.ne	.LBB12_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <12 x i8>* ; view the element pointer as a vector pointer
+  %load = load <12 x i8>, <12 x i8>* %src.gep.cast
+  %ext = zext <12 x i8> %load to <12 x i32> ; the widening whose lowering is being tested
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i32* %dst.gep to <12 x i32>*
+  store <12 x i32> %ext, <12 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 12 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v16i4_to_v16i32_in_loop(i4* %src, i32* %dst) { ; Loop test: each iteration loads <16 x i4> from %src, zero-extends it to <16 x i32> and stores it to %dst (source elements are narrower than a byte).
+; CHECK-LABEL:  _zext_v16i4_to_v16i32_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	movi.4s	v0, #15
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB13_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	x9, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	and	w10, w9, #0xf
+; CHECK-NEXT: 	ubfx	w11, w9, #4, #4
+; CHECK-NEXT: 	fmov	s1, w10
+; CHECK-NEXT: 	ubfx	w10, w9, #8, #4
+; CHECK-NEXT: 	mov.b	v1[1], w11
+; CHECK-NEXT: 	mov.b	v1[2], w10
+; CHECK-NEXT: 	ubfx	w10, w9, #12, #4
+; CHECK-NEXT: 	mov.b	v1[3], w10
+; CHECK-NEXT: 	ubfx	w10, w9, #16, #4
+; CHECK-NEXT: 	mov.b	v1[4], w10
+; CHECK-NEXT: 	ubfx	w10, w9, #20, #4
+; CHECK-NEXT: 	mov.b	v1[5], w10
+; CHECK-NEXT: 	ubfx	w10, w9, #24, #4
+; CHECK-NEXT: 	mov.b	v1[6], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #28, #4
+; CHECK-NEXT: 	mov.b	v1[7], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #32, #4
+; CHECK-NEXT: 	mov.b	v1[8], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #36, #4
+; CHECK-NEXT: 	mov.b	v1[9], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #40, #4
+; CHECK-NEXT: 	mov.b	v1[10], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #44, #4
+; CHECK-NEXT: 	mov.b	v1[11], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #48, #4
+; CHECK-NEXT: 	mov.b	v1[12], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #52, #4
+; CHECK-NEXT: 	mov.b	v1[13], w10
+; CHECK-NEXT: 	ubfx	x10, x9, #56, #4
+; CHECK-NEXT: 	lsr	x9, x9, #60
+; CHECK-NEXT: 	mov.b	v1[14], w10
+; CHECK-NEXT: 	mov.b	v1[15], w9
+; CHECK-NEXT: 	ext.16b	v2, v1, v1, #8
+; CHECK-NEXT: 	zip2.8b	v3, v1, v0
+; CHECK-NEXT: 	zip1.8b	v1, v1, v0
+; CHECK-NEXT: 	zip1.8b	v4, v2, v0
+; CHECK-NEXT: 	zip2.8b	v2, v2, v0
+; CHECK-NEXT: 	ushll.4s	v3, v3, #0
+; CHECK-NEXT: 	ushll.4s	v1, v1, #0
+; CHECK-NEXT: 	and.16b	v3, v3, v0
+; CHECK-NEXT: 	and.16b	v1, v1, v0
+; CHECK-NEXT: 	stp	q1, q3, [x1]
+; CHECK-NEXT: 	ushll.4s	v1, v2, #0
+; CHECK-NEXT: 	ushll.4s	v2, v4, #0
+; CHECK-NEXT: 	and.16b	v1, v1, v0
+; CHECK-NEXT: 	and.16b	v2, v2, v0
+; CHECK-NEXT: 	stp	q2, q1, [x1, #32]
+; CHECK-NEXT: 	add	x1, x1, #64
+; CHECK-NEXT: 	b.ne	LBB13_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v16i4_to_v16i32_in_loop:
+; CHECK-BE: // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	movi	v0.4s, #15
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB13_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	ldr	x9, [x0, x8]
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	lsr	x10, x9, #60
+; CHECK-BE-NEXT:  	ubfx	x11, x9, #56, #4
+; CHECK-BE-NEXT:  	fmov	s1, w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #52, #4
+; CHECK-BE-NEXT:  	mov	v1.b[1], w11
+; CHECK-BE-NEXT:  	add	x11, x1, #32
+; CHECK-BE-NEXT:  	mov	v1.b[2], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #48, #4
+; CHECK-BE-NEXT:  	mov	v1.b[3], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #44, #4
+; CHECK-BE-NEXT:  	mov	v1.b[4], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #40, #4
+; CHECK-BE-NEXT:  	mov	v1.b[5], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #36, #4
+; CHECK-BE-NEXT:  	mov	v1.b[6], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #32, #4
+; CHECK-BE-NEXT:  	mov	v1.b[7], w10
+; CHECK-BE-NEXT:  	ubfx	x10, x9, #28, #4
+; CHECK-BE-NEXT:  	mov	v1.b[8], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #24, #4
+; CHECK-BE-NEXT:  	mov	v1.b[9], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #20, #4
+; CHECK-BE-NEXT:  	mov	v1.b[10], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #16, #4
+; CHECK-BE-NEXT:  	mov	v1.b[11], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #12, #4
+; CHECK-BE-NEXT:  	mov	v1.b[12], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #8, #4
+; CHECK-BE-NEXT:  	mov	v1.b[13], w10
+; CHECK-BE-NEXT:  	ubfx	w10, w9, #4, #4
+; CHECK-BE-NEXT:  	and	w9, w9, #0xf
+; CHECK-BE-NEXT:  	mov	v1.b[14], w10
+; CHECK-BE-NEXT:  	add	x10, x1, #48
+; CHECK-BE-NEXT:  	mov	v1.b[15], w9
+; CHECK-BE-NEXT:  	add	x9, x1, #16
+; CHECK-BE-NEXT:  	ext	v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:  	zip2	v3.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT:  	zip1	v1.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT:  	zip1	v4.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT:  	zip2	v2.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT:  	rev16	v1.8b, v1.8b
+; CHECK-BE-NEXT:  	rev16	v3.8b, v3.8b
+; CHECK-BE-NEXT:  	rev16	v4.8b, v4.8b
+; CHECK-BE-NEXT:  	rev16	v2.8b, v2.8b
+; CHECK-BE-NEXT:  	ushll	v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:  	ushll	v3.4s, v3.4h, #0
+; CHECK-BE-NEXT:  	and	v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT:  	st1	{ v1.4s }, [x1]
+; CHECK-BE-NEXT:  	add	x1, x1, #64
+; CHECK-BE-NEXT:  	ushll	v1.4s, v2.4h, #0
+; CHECK-BE-NEXT:  	ushll	v2.4s, v4.4h, #0
+; CHECK-BE-NEXT:  	and	v3.16b, v3.16b, v0.16b
+; CHECK-BE-NEXT:  	and	v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT:  	st1	{ v3.4s }, [x9]
+; CHECK-BE-NEXT:  	and	v2.16b, v2.16b, v0.16b
+; CHECK-BE-NEXT:  	st1	{ v1.4s }, [x10]
+; CHECK-BE-NEXT:  	st1	{ v2.4s }, [x11]
+; CHECK-BE-NEXT:  	b.ne	.LBB13_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i4, i4* %src, i64 %iv ; address of element %iv of %src, indexed in i4 units
+  %src.gep.cast = bitcast i4* %src.gep to <16 x i4>* ; view the element pointer as a vector pointer
+  %load = load <16 x i4>, <16 x i4>* %src.gep.cast
+  %ext = zext <16 x i4> %load to <16 x i32> ; the widening whose lowering is being tested
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
+  store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements, the same as the vector length
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v16i16_to_v16i64_in_loop(i16* %src, i64* %dst) { ; Loop test: each iteration loads <16 x i16> from %src, zero-extends it to <16 x i64> and stores it to %dst.
+; CHECK-LABEL:  _zext_v16i16_to_v16i64_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB14_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x9, x0, x8
+; CHECK-NEXT: 	add	x8, x8, #32
+; CHECK-NEXT: 	cmp	x8, #256
+; CHECK-NEXT: 	ldp	q0, q1, [x9]
+; CHECK-NEXT: 	ushll.4s	v2, v0, #0
+; CHECK-NEXT: 	ushll2.4s	v0, v0, #0
+; CHECK-NEXT: 	ushll.4s	v3, v1, #0
+; CHECK-NEXT: 	ushll2.4s	v1, v1, #0
+; CHECK-NEXT: 	ushll2.2d	v5, v0, #0
+; CHECK-NEXT: 	ushll2.2d	v4, v1, #0
+; CHECK-NEXT: 	ushll.2d	v1, v1, #0
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	stp	q1, q4, [x1, #96]
+; CHECK-NEXT: 	ushll2.2d	v1, v3, #0
+; CHECK-NEXT: 	stp	q0, q5, [x1, #32]
+; CHECK-NEXT: 	ushll.2d	v3, v3, #0
+; CHECK-NEXT: 	ushll2.2d	v0, v2, #0
+; CHECK-NEXT: 	stp	q3, q1, [x1, #64]
+; CHECK-NEXT: 	ushll.2d	v1, v2, #0
+; CHECK-NEXT: 	stp	q1, q0, [x1], #128
+; CHECK-NEXT: 	b.ne	LBB14_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL:  zext_v16i16_to_v16i64_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB14_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x10, x1, #48
+; CHECK-BE-NEXT:  	add	x8, x8, #32
+; CHECK-BE-NEXT:  	cmp	x8, #256
+; CHECK-BE-NEXT:  	ld1	{ v0.8h }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x9, #16
+; CHECK-BE-NEXT:  	ld1	{ v2.8h }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #32
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	ushll2	v3.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x1, #112
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #16
+; CHECK-BE-NEXT:  	ushll2	v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v2.8h, #0
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #96
+; CHECK-BE-NEXT:  	ushll2	v4.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll	v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v4.2d }, [x10]
+; CHECK-BE-NEXT:  	ushll	v2.4s, v2.4h, #0
+; CHECK-BE-NEXT:  	add	x10, x1, #80
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x1]
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #64
+; CHECK-BE-NEXT:  	add	x1, x1, #128
+; CHECK-BE-NEXT:  	ushll	v3.2d, v2.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v2.2d, v2.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x9]
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x10]
+; CHECK-BE-NEXT:  	b.ne	.LBB14_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i16, i16* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i16* %src.gep to <16 x i16>* ; view the element pointer as a vector pointer
+  %load = load <16 x i16>, <16 x i16>* %src.gep.cast
+  %ext = zext <16 x i16> %load to <16 x i64> ; the widening whose lowering is being tested
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements, the same as the vector length
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v16i32_to_v16i64_in_loop(i32* %src, i64* %dst) { ; Loop test: each iteration loads <16 x i32> from %src, zero-extends it to <16 x i64> and stores it to %dst.
+; CHECK-LABEL:  _zext_v16i32_to_v16i64_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB15_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x9, x0, x8
+; CHECK-NEXT: 	add	x8, x8, #64
+; CHECK-NEXT: 	cmp	x8, #512
+; CHECK-NEXT: 	ldp	q1, q0, [x9, #32]
+; CHECK-NEXT: 	ushll2.2d	v5, v1, #0
+; CHECK-NEXT: 	ushll.2d	v1, v1, #0
+; CHECK-NEXT: 	ldp	q3, q2, [x9]
+; CHECK-NEXT: 	ushll2.2d	v4, v0, #0
+; CHECK-NEXT: 	stp	q1, q5, [x1, #64]
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	stp	q0, q4, [x1, #96]
+; CHECK-NEXT: 	ushll2.2d	v1, v3, #0
+; CHECK-NEXT: 	ushll2.2d	v0, v2, #0
+; CHECK-NEXT: 	ushll.2d	v2, v2, #0
+; CHECK-NEXT: 	stp	q2, q0, [x1, #32]
+; CHECK-NEXT: 	ushll.2d	v0, v3, #0
+; CHECK-NEXT: 	stp	q0, q1, [x1], #128
+; CHECK-NEXT: 	b.ne	LBB15_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL:  zext_v16i32_to_v16i64_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB15_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #64
+; CHECK-BE-NEXT:  	add	x10, x9, #48
+; CHECK-BE-NEXT:  	add	x11, x9, #32
+; CHECK-BE-NEXT:  	cmp	x8, #512
+; CHECK-BE-NEXT:  	ld1	{ v0.4s }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x9, #16
+; CHECK-BE-NEXT:  	ld1	{ v1.4s }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x1, #16
+; CHECK-BE-NEXT:  	ld1	{ v2.4s }, [x11]
+; CHECK-BE-NEXT:  	ushll2	v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	ld1	{ v4.4s }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #112
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x1, #80
+; CHECK-BE-NEXT:  	ushll2	v3.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v2.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #48
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x1, #96
+; CHECK-BE-NEXT:  	ushll	v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	ushll	v3.2d, v4.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v4.2d, v4.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x1]
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v4.2d }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #64
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x1, #32
+; CHECK-BE-NEXT:  	add	x1, x1, #128
+; CHECK-BE-NEXT:  	ushll	v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x10]
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x9]
+; CHECK-BE-NEXT:  	b.ne	.LBB15_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i32, i32* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i32* %src.gep to <16 x i32>* ; view the element pointer as a vector pointer
+  %load = load <16 x i32>, <16 x i32>* %src.gep.cast
+  %ext = zext <16 x i32> %load to <16 x i64> ; the widening whose lowering is being tested (one doubling of the element width)
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements, the same as the vector length
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @zext_v8i8_to_v8i128_in_loop(i8* %src, i128* %dst) { ; Loop test: each iteration loads <8 x i8> from %src, zero-extends it to <8 x i128> and stores it to %dst.
+; CHECK-LABEL: _zext_v8i8_to_v8i128_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB16_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	d0, [x0, x8]
+; CHECK-NEXT: 	add	x9, x1, #112
+; CHECK-NEXT: 	add	x10, x1, #80
+; CHECK-NEXT: 	str	xzr, [x1, #120]
+; CHECK-NEXT: 	str	xzr, [x1, #104]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	str	xzr, [x1, #88]
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	str	xzr, [x1, #72]
+; CHECK-NEXT: 	str	xzr, [x1, #56]
+; CHECK-NEXT: 	ushll2.4s	v1, v0, #0
+; CHECK-NEXT: 	str	xzr, [x1, #40]
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	str	xzr, [x1, #24]
+; CHECK-NEXT: 	ushll2.2d	v2, v1, #0
+; CHECK-NEXT: 	str	xzr, [x1, #8]
+; CHECK-NEXT: 	ushll.2d	v1, v1, #0
+; CHECK-NEXT: 	st1.d	{ v2 }[1], [x9]
+; CHECK-NEXT: 	add	x9, x1, #48
+; CHECK-NEXT: 	str	d2, [x1, #96]
+; CHECK-NEXT: 	ushll2.2d	v2, v0, #0
+; CHECK-NEXT: 	st1.d	{ v1 }[1], [x10]
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	str	d1, [x1, #64]
+; CHECK-NEXT: 	str	d2, [x1, #32]
+; CHECK-NEXT: 	add	x10, x1, #16
+; CHECK-NEXT: 	str	d0, [x1]
+; CHECK-NEXT: 	add	x1, x1, #256
+; CHECK-NEXT: 	st1.d	{ v2 }[1], [x9]
+; CHECK-NEXT: 	st1.d	{ v0 }[1], [x10]
+; CHECK-NEXT: 	b.ne	LBB16_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL:  zext_v8i8_to_v8i128_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB16_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x10, x1, #88
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #120
+; CHECK-BE-NEXT:  	str	xzr, [x1, #112]
+; CHECK-BE-NEXT:  	str	xzr, [x1, #96]
+; CHECK-BE-NEXT:  	str	xzr, [x1, #80]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	str	xzr, [x1, #64]
+; CHECK-BE-NEXT:  	str	xzr, [x1, #48]
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	str	xzr, [x1, #32]
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	str	xzr, [x1, #16]
+; CHECK-BE-NEXT:  	ushll2	v2.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	str	xzr, [x1]
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v2.d }[1], [x9]
+; CHECK-BE-NEXT:  	add	x9, x1, #56
+; CHECK-BE-NEXT:  	str	d2, [x1, #104]
+; CHECK-BE-NEXT:  	ushll2	v2.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v1.d }[1], [x10]
+; CHECK-BE-NEXT:  	ushll	v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	str	d1, [x1, #72]
+; CHECK-BE-NEXT:  	str	d2, [x1, #40]
+; CHECK-BE-NEXT:  	add	x10, x1, #24
+; CHECK-BE-NEXT:  	str	d0, [x1, #8]
+; CHECK-BE-NEXT:  	add	x1, x1, #256
+; CHECK-BE-NEXT:  	st1	{ v2.d }[1], [x9]
+; CHECK-BE-NEXT:  	st1	{ v0.d }[1], [x10]
+; CHECK-BE-NEXT:  	b.ne	.LBB16_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back edge
+  %src.gep = getelementptr i8, i8* %src, i64 %iv ; address of element %iv of %src
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* ; view the element pointer as a vector pointer
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i128> ; the widening whose lowering is being tested; each result element is 128 bits wide
+  %dst.gep = getelementptr i128, i128* %dst, i64 %iv ; address of element %iv of %dst
+  %dst.gep.cast = bitcast i128* %dst.gep to <8 x i128>*
+  store <8 x i128> %ext, <8 x i128>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16 ; index steps by 16 elements although only 8 are loaded/stored per iteration
+  %ec = icmp eq i64 %iv.next, 128 ; exit once the index reaches 128, i.e. after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Loop test: two back-to-back zext <8 x i8> -> <8 x i64> in one iteration.
+; Each widened vector is added to an <8 x i64> loaded from %dst and the sum is
+; stored back to the address it was loaded from.
+; The first group of expectations below is for the arm64-apple-ios run lines,
+; the second group for the aarch64_be-unknown-linux (big-endian) run lines.
+define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL:  _zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: 	add	x9, x0, #8
+; CHECK-NEXT: LBB17_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x10, x1, x8
+; CHECK-NEXT: 	add	x8, x8, #128
+; CHECK-NEXT: 	ldp	d0, d1, [x9, #-8]
+; CHECK-NEXT: 	add	x9, x9, #16
+; CHECK-NEXT: 	cmp	x8, #1024
+; CHECK-NEXT: 	ldp	q3, q2, [x10, #32]
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll.8h	v1, v1, #0
+; CHECK-NEXT: 	ushll2.4s	v6, v0, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	ldp	q5, q4, [x10]
+; CHECK-NEXT: 	uaddw2.2d	v2, v2, v6
+; CHECK-NEXT: 	uaddw.2d	v3, v3, v6
+; CHECK-NEXT: 	ushll2.4s	v7, v1, #0
+; CHECK-NEXT: 	ushll.4s	v1, v1, #0
+; CHECK-NEXT: 	stp	q3, q2, [x10, #32]
+; CHECK-NEXT: 	ldp	q17, q16, [x10, #96]
+; CHECK-NEXT: 	uaddw2.2d	v4, v4, v0
+; CHECK-NEXT: 	uaddw.2d	v0, v5, v0
+; CHECK-NEXT: 	uaddw.2d	v3, v17, v7
+; CHECK-NEXT: 	stp	q0, q4, [x10]
+; CHECK-NEXT: 	ldp	q6, q18, [x10, #64]
+; CHECK-NEXT: 	uaddw2.2d	v2, v16, v7
+; CHECK-NEXT: 	stp	q3, q2, [x10, #96]
+; CHECK-NEXT: 	uaddw2.2d	v0, v18, v1
+; CHECK-NEXT: 	uaddw.2d	v1, v6, v1
+; CHECK-NEXT: 	stp	q1, q0, [x10, #64]
+; CHECK-NEXT: 	b.ne	LBB17_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  	add	x9, x0, #8
+; CHECK-BE-NEXT:  .LBB17_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	sub	x12, x9, #8
+; CHECK-BE-NEXT:  	add	x10, x1, x8
+; CHECK-BE-NEXT:  	add	x11, x10, #48
+; CHECK-BE-NEXT:  	add	x13, x10, #32
+; CHECK-BE-NEXT:  	add	x14, x10, #16
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	ld1	{ v2.8b }, [x12]
+; CHECK-BE-NEXT:  	add	x12, x10, #112
+; CHECK-BE-NEXT:  	ld1	{ v1.2d }, [x11]
+; CHECK-BE-NEXT:  	add	x15, x10, #96
+; CHECK-BE-NEXT:  	add	x16, x10, #64
+; CHECK-BE-NEXT:  	add	x17, x10, #80
+; CHECK-BE-NEXT:  	ld1	{ v3.2d }, [x13]
+; CHECK-BE-NEXT:  	add	x8, x8, #128
+; CHECK-BE-NEXT:  	ushll	v2.8h, v2.8b, #0
+; CHECK-BE-NEXT:  	add	x9, x9, #16
+; CHECK-BE-NEXT:  	ld1	{ v5.2d }, [x10]
+; CHECK-BE-NEXT:  	cmp	x8, #1024
+; CHECK-BE-NEXT:  	ushll2	v7.4s, v2.8h, #0
+; CHECK-BE-NEXT:  	ld1	{ v6.2d }, [x14]
+; CHECK-BE-NEXT:  	uaddw2	v1.2d, v1.2d, v7.4s
+; CHECK-BE-NEXT:  	ushll	v2.4s, v2.4h, #0
+; CHECK-BE-NEXT:  	ld1	{ v16.2d }, [x12]
+; CHECK-BE-NEXT:  	uaddw	v3.2d, v3.2d, v7.2s
+; CHECK-BE-NEXT:  	ld1	{ v17.2d }, [x15]
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x11]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ld1	{ v7.2d }, [x17]
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x13]
+; CHECK-BE-NEXT:  	uaddw	v5.2d, v5.2d, v2.2s
+; CHECK-BE-NEXT:  	ld1	{ v1.2d }, [x16]
+; CHECK-BE-NEXT:  	uaddw2	v2.2d, v6.2d, v2.4s
+; CHECK-BE-NEXT:  	ushll2	v4.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x10]
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	uaddw2	v6.2d, v16.2d, v4.4s
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x14]
+; CHECK-BE-NEXT:  	uaddw	v3.2d, v17.2d, v4.2s
+; CHECK-BE-NEXT:  	uaddw2	v2.2d, v7.2d, v0.4s
+; CHECK-BE-NEXT:  	uaddw	v0.2d, v1.2d, v0.2s
+; CHECK-BE-NEXT:  	st1	{ v6.2d }, [x12]
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x15]
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x17]
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x16]
+; CHECK-BE-NEXT:  	b.ne	.LBB17_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; First source vector: 8 bytes at %src + %iv.
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  ; Second source vector: the 8 bytes immediately after the first.
+  %src.gep.2 = getelementptr i8, i8* %src.gep, i64 8
+  %src.gep.cast.2 = bitcast i8* %src.gep.2 to <8 x i8>*
+  %load.2 = load <8 x i8>, <8 x i8>* %src.gep.cast.2
+  ; Zero-extend both byte vectors to 64-bit lanes.
+  %ext = zext <8 x i8> %load to <8 x i64>
+  %ext.2 = zext <8 x i8> %load.2 to <8 x i64>
+  ; Accumulator vectors: 8 x i64 at &%dst[%iv] and the 8 x i64 right after it.
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+  %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>*
+  %load.dst = load <8 x i64>, <8 x i64>* %dst.gep.cast
+  %dst.gep.2 = getelementptr i64, i64* %dst.gep, i64 8
+  %dst.gep.cast.2 = bitcast i64* %dst.gep.2 to <8 x i64>*
+  %load.dst.2 = load <8 x i64>, <8 x i64>* %dst.gep.cast.2
+  ; Add the widened bytes into the accumulators.
+  %sum = add <8 x i64> %load.dst, %ext
+  %sum.2 = add <8 x i64> %load.dst.2, %ext.2
+  ; Store both sums back to the addresses they were loaded from.
+  %dst.gep.cast.3 = bitcast i64* %dst.gep to <8 x i64>*
+  store <8 x i64> %sum, <8 x i64>* %dst.gep.cast.3
+  %dst.gep.cast.4 = bitcast i64* %dst.gep.2 to <8 x i64>*
+  store <8 x i64> %sum.2, <8 x i64>* %dst.gep.cast.4
+  ; %iv advances by 16 per iteration; the loop exits once it reaches 128.
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Loop test: two back-to-back zext <16 x i8> -> <16 x i64> in one iteration,
+; each result stored directly to %dst (no arithmetic on the widened values).
+; The first group of expectations below is for the arm64-apple-ios run lines,
+; the second group for the aarch64_be-unknown-linux (big-endian) run lines.
+define void @zext_v16i8_to_v16i64_in_sequence_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v16i8_to_v16i64_in_sequence_in_loop:
+; CHECK: ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: 	add	x9, x1, #128
+; CHECK-NEXT: LBB18_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x10, x0, x8
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ldp	q0, q1, [x10]
+; CHECK-NEXT: 	ushll.8h	v2, v0, #0
+; CHECK-NEXT: 	ushll2.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll.4s	v4, v2, #0
+; CHECK-NEXT: 	ushll2.4s	v5, v0, #0
+; CHECK-NEXT: 	ushll2.4s	v2, v2, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	ushll2.2d	v6, v5, #0
+; CHECK-NEXT: 	ushll.2d	v5, v5, #0
+; CHECK-NEXT: 	ushll2.8h	v3, v1, #0
+; CHECK-NEXT: 	ushll2.2d	v7, v0, #0
+; CHECK-NEXT: 	stp	q5, q6, [x9, #-32]
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	ushll2.2d	v5, v2, #0
+; CHECK-NEXT: 	ushll.2d	v2, v2, #0
+; CHECK-NEXT: 	stp	q0, q7, [x9, #-64]
+; CHECK-NEXT: 	ushll2.2d	v0, v4, #0
+; CHECK-NEXT: 	stp	q2, q5, [x9, #-96]
+; CHECK-NEXT: 	ushll2.4s	v5, v3, #0
+; CHECK-NEXT: 	ushll.2d	v2, v4, #0
+; CHECK-NEXT: 	ushll2.2d	v4, v5, #0
+; CHECK-NEXT: 	stp	q2, q0, [x9, #-128]
+; CHECK-NEXT: 	ushll.2d	v0, v5, #0
+; CHECK-NEXT: 	ushll.4s	v2, v3, #0
+; CHECK-NEXT: 	stp	q0, q4, [x9, #96]
+; CHECK-NEXT: 	ushll.8h	v0, v1, #0
+; CHECK-NEXT: 	ushll2.2d	v1, v2, #0
+; CHECK-NEXT: 	ushll.2d	v2, v2, #0
+; CHECK-NEXT: 	ushll2.4s	v3, v0, #0
+; CHECK-NEXT: 	stp	q2, q1, [x9, #64]
+; CHECK-NEXT: 	ushll2.2d	v1, v3, #0
+; CHECK-NEXT: 	ushll.2d	v2, v3, #0
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	stp	q2, q1, [x9, #32]
+; CHECK-NEXT: 	ushll2.2d	v1, v0, #0
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	stp	q0, q1, [x9], #128
+; CHECK-NEXT: 	b.ne	LBB18_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  	add	x9, x1, #128
+; CHECK-BE-NEXT:  .LBB18_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x10, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	add	x11, x10, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.16b }, [x10]
+; CHECK-BE-NEXT:  	sub	x10, x9, #16
+; CHECK-BE-NEXT:  	ld1	{ v3.16b }, [x11]
+; CHECK-BE-NEXT:  	sub	x11, x9, #32
+; CHECK-BE-NEXT:  	ushll2	v1.8h, v0.16b, #0
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll2	v2.4s, v1.8h, #0
+; CHECK-BE-NEXT:  	ushll	v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:  	ushll2	v4.2d, v2.4s, #0
+; CHECK-BE-NEXT:  	ushll	v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v4.2d }, [x10]
+; CHECK-BE-NEXT:  	sub	x10, x9, #48
+; CHECK-BE-NEXT:  	ushll2	v4.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x11]
+; CHECK-BE-NEXT:  	ushll2	v2.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	sub	x11, x9, #80
+; CHECK-BE-NEXT:  	st1	{ v4.2d }, [x10]
+; CHECK-BE-NEXT:  	sub	x10, x9, #64
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v2.4s, #0
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x11]
+; CHECK-BE-NEXT:  	sub	x11, x9, #96
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x10]
+; CHECK-BE-NEXT:  	ushll	v4.8h, v3.8b, #0
+; CHECK-BE-NEXT:  	sub	x10, x9, #112
+; CHECK-BE-NEXT:  	ushll2	v3.8h, v3.16b, #0
+; CHECK-BE-NEXT:  	ushll	v5.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v0.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	ushll	v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:  	ushll	v1.4s, v4.4h, #0
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x10]
+; CHECK-BE-NEXT:  	ushll2	v6.4s, v3.8h, #0
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x11]
+; CHECK-BE-NEXT:  	sub	x11, x9, #128
+; CHECK-BE-NEXT:  	add	x10, x9, #112
+; CHECK-BE-NEXT:  	ushll	v2.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v0.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll2	v1.2d, v6.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x11]
+; CHECK-BE-NEXT:  	ushll	v3.4s, v3.4h, #0
+; CHECK-BE-NEXT:  	add	x11, x9, #96
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x9, #80
+; CHECK-BE-NEXT:  	ushll	v5.2d, v6.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v2.2d }, [x9]
+; CHECK-BE-NEXT:  	ushll2	v1.4s, v4.8h, #0
+; CHECK-BE-NEXT:  	ushll2	v4.2d, v3.4s, #0
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x11]
+; CHECK-BE-NEXT:  	add	x11, x9, #48
+; CHECK-BE-NEXT:  	st1	{ v4.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x9, #64
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	ushll	v3.2d, v3.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v5.2d }, [x11]
+; CHECK-BE-NEXT:  	add	x11, x9, #16
+; CHECK-BE-NEXT:  	st1	{ v3.2d }, [x10]
+; CHECK-BE-NEXT:  	add	x10, x9, #32
+; CHECK-BE-NEXT:  	add	x9, x9, #128
+; CHECK-BE-NEXT:  	ushll	v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	st1	{ v0.2d }, [x11]
+; CHECK-BE-NEXT:  	st1	{ v1.2d }, [x10]
+; CHECK-BE-NEXT:  	b.ne	.LBB18_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; First source vector: 16 bytes at %src + %iv.
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
+  %load = load <16 x i8>, <16 x i8>* %src.gep.cast
+  ; Second source vector: 16 bytes further on. Because %iv also steps by 16,
+  ; this is the same address the next iteration uses for %src.gep.
+  %src.gep.2 = getelementptr i8, i8* %src.gep, i64 16
+  %src.gep.cast.2 = bitcast i8* %src.gep.2 to <16 x i8>*
+  %load.2 = load <16 x i8>, <16 x i8>* %src.gep.cast.2
+  ; Zero-extend both byte vectors to 64-bit lanes.
+  %ext = zext <16 x i8> %load to <16 x i64>
+  %ext.2 = zext <16 x i8> %load.2 to <16 x i64>
+  ; First result goes to &%dst[%iv].
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+  ; Second result goes 16 i64 elements further on, i.e. &%dst[%iv + 16].
+  %dst.gep.2 = getelementptr i64, i64* %dst.gep, i64 16
+  %dst.gep.cast.2 = bitcast i64* %dst.gep.2 to <16 x i64>*
+  store <16 x i64> %ext.2, <16 x i64>* %dst.gep.cast.2
+  ; %iv advances by 16 per iteration; the loop exits once it reaches 128.
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:  LBB8_1: ; %loop
+; CHECK-NEXT:  LBB19_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
@@ -815,7 +1865,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-NEXT:    b.ne LBB8_1
+; CHECK-NEXT:    b.ne LBB19_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ;
@@ -823,7 +1873,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
 ; CHECK-BE:       // %bb.0: // %entry
 ; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:    ptrue p0.s
-; CHECK-BE-NEXT:  .LBB8_1: // %loop
+; CHECK-BE-NEXT:  .LBB19_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
@@ -841,7 +1891,7 @@ define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst)
 ; CHECK-BE-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-BE-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
 ; CHECK-BE-NEXT:    st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-BE-NEXT:    b.ne .LBB8_1
+; CHECK-BE-NEXT:    b.ne .LBB19_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
 entry:
@@ -864,3 +1914,519 @@ loop:
 exit:
   ret void
 }
+
+; Loop test: zext <20 x i8> -> <20 x i24>, stored to %dst. Neither the lane
+; count (20) nor the destination element width (24 bits) is a power of two.
+; The first group of expectations below is for the arm64-apple-ios run lines,
+; the second group for the aarch64_be-unknown-linux (big-endian) run lines.
+define void @zext_v20i8_to_v20i24_in_loop(i8* %src, i24* %dst) {
+; CHECK-LABEL: _zext_v20i8_to_v20i24_in_loop:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB20_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x9, x0, x8
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ldp	q0, q3, [x9]
+; CHECK-NEXT: 	strb	wzr, [x1, #59]
+; CHECK-NEXT: 	strb	wzr, [x1, #56]
+; CHECK-NEXT: 	strb	wzr, [x1, #53]
+; CHECK-NEXT: 	ushll2.8h	v1, v0, #0
+; CHECK-NEXT: 	strb	wzr, [x1, #50]
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	strb	wzr, [x1, #47]
+; CHECK-NEXT: 	ushll2.4s	v2, v1, #0
+; CHECK-NEXT: 	strb	wzr, [x1, #44]
+; CHECK-NEXT: 	ushll.4s	v1, v1, #0
+; CHECK-NEXT: 	strb	wzr, [x1, #41]
+; CHECK-NEXT: 	mov.s	w9, v2[2]
+; CHECK-NEXT: 	strb	wzr, [x1, #38]
+; CHECK-NEXT: 	fmov	w10, s2
+; CHECK-NEXT: 	fmov	w12, s1
+; CHECK-NEXT: 	mov.s	w11, v2[3]
+; CHECK-NEXT: 	mov.s	w13, v2[1]
+; CHECK-NEXT: 	ushll.8h	v2, v3, #0
+; CHECK-NEXT: 	strb	wzr, [x1, #35]
+; CHECK-NEXT: 	strh	w9, [x1, #42]
+; CHECK-NEXT: 	mov.s	w9, v1[1]
+; CHECK-NEXT: 	strh	w10, [x1, #36]
+; CHECK-NEXT: 	mov.s	w10, v1[3]
+; CHECK-NEXT: 	strh	w12, [x1, #24]
+; CHECK-NEXT: 	mov.s	w12, v1[2]
+; CHECK-NEXT: 	ushll2.4s	v1, v0, #0
+; CHECK-NEXT: 	sturh	w11, [x1, #45]
+; CHECK-NEXT: 	sturh	w9, [x1, #27]
+; CHECK-NEXT: 	mov.s	w9, v1[1]
+; CHECK-NEXT: 	mov.s	w11, v1[2]
+; CHECK-NEXT: 	ushll.4s	v2, v2, #0
+; CHECK-NEXT: 	sturh	w10, [x1, #33]
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	sturh	w13, [x1, #39]
+; CHECK-NEXT: 	mov.s	w10, v1[3]
+; CHECK-NEXT: 	strh	w12, [x1, #30]
+; CHECK-NEXT: 	sturh	w9, [x1, #15]
+; CHECK-NEXT: 	mov.s	w9, v2[2]
+; CHECK-NEXT: 	strh	w11, [x1, #18]
+; CHECK-NEXT: 	mov.s	w11, v2[3]
+; CHECK-NEXT: 	strb	wzr, [x1, #32]
+; CHECK-NEXT: 	sturh	w10, [x1, #21]
+; CHECK-NEXT: 	fmov	w10, s1
+; CHECK-NEXT: 	strh	w9, [x1, #54]
+; CHECK-NEXT: 	mov.s	w9, v2[1]
+; CHECK-NEXT: 	sturh	w11, [x1, #57]
+; CHECK-NEXT: 	strh	w10, [x1, #12]
+; CHECK-NEXT: 	strb	wzr, [x1, #29]
+; CHECK-NEXT: 	sturh	w9, [x1, #51]
+; CHECK-NEXT: 	fmov	w9, s2
+; CHECK-NEXT: 	strb	wzr, [x1, #26]
+; CHECK-NEXT: 	strb	wzr, [x1, #23]
+; CHECK-NEXT: 	strh	w9, [x1, #48]
+; CHECK-NEXT: 	mov.s	w9, v0[3]
+; CHECK-NEXT: 	strb	wzr, [x1, #20]
+; CHECK-NEXT: 	strb	wzr, [x1, #17]
+; CHECK-NEXT: 	strb	wzr, [x1, #14]
+; CHECK-NEXT: 	sturh	w9, [x1, #9]
+; CHECK-NEXT: 	mov.s	w9, v0[2]
+; CHECK-NEXT: 	strb	wzr, [x1, #11]
+; CHECK-NEXT: 	strb	wzr, [x1, #8]
+; CHECK-NEXT: 	strb	wzr, [x1, #5]
+; CHECK-NEXT: 	strh	w9, [x1, #6]
+; CHECK-NEXT: 	mov.s	w9, v0[1]
+; CHECK-NEXT: 	strb	wzr, [x1, #2]
+; CHECK-NEXT: 	sturh	w9, [x1, #3]
+; CHECK-NEXT: 	fmov	w9, s0
+; CHECK-NEXT: 	strh	w9, [x1], #64
+; CHECK-NEXT: 	b.ne	LBB20_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB20_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	add	x10, x9, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v1.16b }, [x9]
+; CHECK-BE-NEXT:  	ld1	{ v0.16b }, [x10]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #57]
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #54]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #51]
+; CHECK-BE-NEXT:  	ushll2	v2.8h, v1.16b, #0
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #48]
+; CHECK-BE-NEXT:  	ushll	v1.8h, v1.8b, #0
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #45]
+; CHECK-BE-NEXT:  	ushll2	v3.4s, v2.8h, #0
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #42]
+; CHECK-BE-NEXT:  	ushll	v2.4s, v2.4h, #0
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #39]
+; CHECK-BE-NEXT:  	fmov	w9, s3
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #36]
+; CHECK-BE-NEXT:  	mov	w10, v3.s[3]
+; CHECK-BE-NEXT:  	fmov	w11, s2
+; CHECK-BE-NEXT:  	mov	w14, v2.s[3]
+; CHECK-BE-NEXT:  	mov	w12, v3.s[2]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #38]
+; CHECK-BE-NEXT:  	mov	w9, v2.s[2]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	strb	w11, [x1, #26]
+; CHECK-BE-NEXT:  	strb	w10, [x1, #47]
+; CHECK-BE-NEXT:  	mov	w10, v2.s[1]
+; CHECK-BE-NEXT:  	ushll2	v2.4s, v1.8h, #0
+; CHECK-BE-NEXT:  	mov	w13, v3.s[1]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #32]
+; CHECK-BE-NEXT:  	mov	w9, v2.s[1]
+; CHECK-BE-NEXT:  	mov	w11, v2.s[3]
+; CHECK-BE-NEXT:  	ushll	v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:  	strb	w12, [x1, #44]
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	strb	w10, [x1, #29]
+; CHECK-BE-NEXT:  	mov	w12, v2.s[2]
+; CHECK-BE-NEXT:  	strb	w13, [x1, #41]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #17]
+; CHECK-BE-NEXT:  	mov	w9, v1.s[2]
+; CHECK-BE-NEXT:  	strb	w11, [x1, #23]
+; CHECK-BE-NEXT:  	mov	w11, v1.s[3]
+; CHECK-BE-NEXT:  	fmov	w10, s2
+; CHECK-BE-NEXT:  	strb	w14, [x1, #35]
+; CHECK-BE-NEXT:  	strb	w12, [x1, #20]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #8]
+; CHECK-BE-NEXT:  	mov	w9, v1.s[1]
+; CHECK-BE-NEXT:  	strb	w10, [x1, #14]
+; CHECK-BE-NEXT:  	strb	w11, [x1, #11]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #33]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #5]
+; CHECK-BE-NEXT:  	fmov	w9, s1
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #30]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #27]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #2]
+; CHECK-BE-NEXT:  	mov	w9, v0.s[3]
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #24]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #21]
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #18]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #59]
+; CHECK-BE-NEXT:  	mov	w9, v0.s[2]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #15]
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #12]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #9]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #56]
+; CHECK-BE-NEXT:  	mov	w9, v0.s[1]
+; CHECK-BE-NEXT:  	strh	wzr, [x1, #6]
+; CHECK-BE-NEXT:  	sturh	wzr, [x1, #3]
+; CHECK-BE-NEXT:  	strh	wzr, [x1]
+; CHECK-BE-NEXT:  	strb	w9, [x1, #53]
+; CHECK-BE-NEXT:  	fmov	w9, s0
+; CHECK-BE-NEXT:  	strb	w9, [x1, #50]
+; CHECK-BE-NEXT:  	add	x1, x1, #64
+; CHECK-BE-NEXT:  	b.ne	.LBB20_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; Load 20 bytes starting at %src + %iv.
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <20 x i8>*
+  %load = load <20 x i8>, <20 x i8>* %src.gep.cast
+  ; Zero-extend every byte lane to 24 bits.
+  %ext = zext <20 x i8> %load to <20 x i24>
+  ; Store the widened vector at &%dst[%iv] (indexed in i24 elements).
+  %dst.gep = getelementptr i24, i24* %dst, i64 %iv
+  %dst.gep.cast = bitcast i24* %dst.gep to <20 x i24>*
+  store <20 x i24> %ext, <20 x i24>* %dst.gep.cast
+  ; %iv advances by 16 per iteration (fewer than the 20 lanes accessed);
+  ; the loop exits once it reaches 128.
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Loop test: zext <23 x i8> -> <23 x i48>, stored to %dst. Neither the lane
+; count (23) nor the destination element width (48 bits) is a power of two.
+; The first group of expectations below is for the arm64-apple-ios run lines,
+; the second group for the aarch64_be-unknown-linux (big-endian) run lines.
+define void @zext_v23i8_to_v23i48_in_loop(i8* %src, i48* %dst) {
+; CHECK-LABEL:  _zext_v23i8_to_v23i48_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB21_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	add	x9, x0, x8
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	ldp	q1, q2, [x9]
+; CHECK-NEXT: 	strh	wzr, [x1, #136]
+; CHECK-NEXT: 	strh	wzr, [x1, #130]
+; CHECK-NEXT: 	strh	wzr, [x1, #124]
+; CHECK-NEXT: 	ushll2.8h	v3, v1, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #118]
+; CHECK-NEXT: 	ushll.8h	v0, v1, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #112]
+; CHECK-NEXT: 	ushll.8h	v2, v2, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #106]
+; CHECK-NEXT: 	ushll.4s	v1, v3, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #100]
+; CHECK-NEXT: 	ushll2.4s	v3, v3, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #94]
+; CHECK-NEXT: 	ushll2.4s	v4, v2, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #88]
+; CHECK-NEXT: 	ushll.2d	v5, v3, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #82]
+; CHECK-NEXT: 	ushll2.2d	v3, v3, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #76]
+; CHECK-NEXT: 	ushll2.2d	v6, v4, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #70]
+; CHECK-NEXT: 	fmov	x10, d3
+; CHECK-NEXT: 	strh	wzr, [x1, #64]
+; CHECK-NEXT: 	mov.d	x9, v3[1]
+; CHECK-NEXT: 	fmov	x11, d6
+; CHECK-NEXT: 	ushll2.4s	v6, v0, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #58]
+; CHECK-NEXT: 	fmov	x12, d5
+; CHECK-NEXT: 	str	w10, [x1, #84]
+; CHECK-NEXT: 	mov.d	x10, v5[1]
+; CHECK-NEXT: 	str	w11, [x1, #132]
+; CHECK-NEXT: 	ushll2.2d	v5, v6, #0
+; CHECK-NEXT: 	stur	w9, [x1, #90]
+; CHECK-NEXT: 	ushll2.2d	v3, v1, #0
+; CHECK-NEXT: 	str	w12, [x1, #72]
+; CHECK-NEXT: 	fmov	x9, d5
+; CHECK-NEXT: 	strh	wzr, [x1, #52]
+; CHECK-NEXT: 	fmov	x13, d3
+; CHECK-NEXT: 	stur	w10, [x1, #78]
+; CHECK-NEXT: 	mov.d	x11, v3[1]
+; CHECK-NEXT: 	strh	wzr, [x1, #46]
+; CHECK-NEXT: 	ushll.2d	v3, v4, #0
+; CHECK-NEXT: 	str	w9, [x1, #36]
+; CHECK-NEXT: 	ushll.4s	v2, v2, #0
+; CHECK-NEXT: 	str	w13, [x1, #60]
+; CHECK-NEXT: 	fmov	x9, d3
+; CHECK-NEXT: 	strh	wzr, [x1, #40]
+; CHECK-NEXT: 	ushll2.2d	v4, v2, #0
+; CHECK-NEXT: 	stur	w11, [x1, #66]
+; CHECK-NEXT: 	ushll.2d	v1, v1, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #34]
+; CHECK-NEXT: 	str	w9, [x1, #120]
+; CHECK-NEXT: 	fmov	x9, d4
+; CHECK-NEXT: 	mov.d	x11, v1[1]
+; CHECK-NEXT: 	strh	wzr, [x1, #28]
+; CHECK-NEXT: 	ushll.4s	v0, v0, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #22]
+; CHECK-NEXT: 	str	w9, [x1, #108]
+; CHECK-NEXT: 	fmov	x9, d1
+; CHECK-NEXT: 	ushll.2d	v1, v6, #0
+; CHECK-NEXT: 	strh	wzr, [x1, #16]
+; CHECK-NEXT: 	mov.d	x10, v3[1]
+; CHECK-NEXT: 	stur	w11, [x1, #54]
+; CHECK-NEXT: 	str	w9, [x1, #48]
+; CHECK-NEXT: 	mov.d	x9, v1[1]
+; CHECK-NEXT: 	strh	wzr, [x1, #10]
+; CHECK-NEXT: 	mov.d	x12, v5[1]
+; CHECK-NEXT: 	strh	wzr, [x1, #4]
+; CHECK-NEXT: 	stur	w10, [x1, #126]
+; CHECK-NEXT: 	mov.d	x10, v4[1]
+; CHECK-NEXT: 	stur	w9, [x1, #30]
+; CHECK-NEXT: 	fmov	x9, d1
+; CHECK-NEXT: 	ushll2.2d	v1, v0, #0
+; CHECK-NEXT: 	stur	w12, [x1, #42]
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	str	w9, [x1, #24]
+; CHECK-NEXT: 	mov.d	x9, v1[1]
+; CHECK-NEXT: 	stur	w10, [x1, #114]
+; CHECK-NEXT: 	stur	w9, [x1, #18]
+; CHECK-NEXT: 	fmov	x9, d1
+; CHECK-NEXT: 	ushll.2d	v1, v2, #0
+; CHECK-NEXT: 	str	w9, [x1, #12]
+; CHECK-NEXT: 	mov.d	x9, v1[1]
+; CHECK-NEXT: 	stur	w9, [x1, #102]
+; CHECK-NEXT: 	fmov	x9, d1
+; CHECK-NEXT: 	str	w9, [x1, #96]
+; CHECK-NEXT: 	mov.d	x9, v0[1]
+; CHECK-NEXT: 	stur	w9, [x1, #6]
+; CHECK-NEXT: 	fmov	x9, d0
+; CHECK-NEXT: 	str	w9, [x1], #128
+; CHECK-NEXT: 	b.ne	LBB21_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB21_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	add	x10, x9, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v2.16b }, [x9]
+; CHECK-BE-NEXT:  	ld1	{ v1.16b }, [x10]
+; CHECK-BE-NEXT:  	str	wzr, [x1, #132]
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #126]
+; CHECK-BE-NEXT:  	str	wzr, [x1, #120]
+; CHECK-BE-NEXT:  	ushll2	v3.8h, v2.16b, #0
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #114]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v2.8b, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #108]
+; CHECK-BE-NEXT:  	ushll	v4.4s, v3.4h, #0
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #102]
+; CHECK-BE-NEXT:  	ushll2	v3.4s, v3.8h, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #96]
+; CHECK-BE-NEXT:  	ushll2	v2.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #90]
+; CHECK-BE-NEXT:  	ushll2	v7.2d, v3.4s, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #84]
+; CHECK-BE-NEXT:  	ushll	v3.2d, v3.2s, #0
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #78]
+; CHECK-BE-NEXT:  	ushll	v1.8h, v1.8b, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #72]
+; CHECK-BE-NEXT:  	fmov	x12, d3
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #66]
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v2.4s, #0
+; CHECK-BE-NEXT:  	fmov	x10, d7
+; CHECK-BE-NEXT:  	mov	x9, v7.d[1]
+; CHECK-BE-NEXT:  	str	wzr, [x1, #60]
+; CHECK-BE-NEXT:  	mov	x11, v3.d[1]
+; CHECK-BE-NEXT:  	strh	w12, [x1, #76]
+; CHECK-BE-NEXT:  	ushll2	v3.4s, v1.8h, #0
+; CHECK-BE-NEXT:  	strh	w10, [x1, #88]
+; CHECK-BE-NEXT:  	ushll2	v6.2d, v4.4s, #0
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #54]
+; CHECK-BE-NEXT:  	fmov	x14, d5
+; CHECK-BE-NEXT:  	strh	w9, [x1, #94]
+; CHECK-BE-NEXT:  	mov	x12, v5.d[1]
+; CHECK-BE-NEXT:  	strh	w11, [x1, #82]
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v3.4s, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #48]
+; CHECK-BE-NEXT:  	mov	x10, v6.d[1]
+; CHECK-BE-NEXT:  	strh	w14, [x1, #40]
+; CHECK-BE-NEXT:  	fmov	x9, d5
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #42]
+; CHECK-BE-NEXT:  	ushll	v4.2d, v4.2s, #0
+; CHECK-BE-NEXT:  	strh	w12, [x1, #46]
+; CHECK-BE-NEXT:  	ushll	v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #36]
+; CHECK-BE-NEXT:  	strh	w10, [x1, #70]
+; CHECK-BE-NEXT:  	fmov	x13, d6
+; CHECK-BE-NEXT:  	mov	x10, v4.d[1]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #136]
+; CHECK-BE-NEXT:  	fmov	x9, d4
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #30]
+; CHECK-BE-NEXT:  	ushll2	v5.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	strh	w13, [x1, #64]
+; CHECK-BE-NEXT:  	ushll	v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #24]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #52]
+; CHECK-BE-NEXT:  	fmov	x9, d5
+; CHECK-BE-NEXT:  	mov	x11, v2.d[1]
+; CHECK-BE-NEXT:  	strh	w10, [x1, #58]
+; CHECK-BE-NEXT:  	fmov	x10, d2
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #18]
+; CHECK-BE-NEXT:  	ushll	v2.2d, v3.2s, #0
+; CHECK-BE-NEXT:  	strh	w9, [x1, #16]
+; CHECK-BE-NEXT:  	ushll	v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:  	str	wzr, [x1, #12]
+; CHECK-BE-NEXT:  	mov	x9, v2.d[1]
+; CHECK-BE-NEXT:  	strh	w11, [x1, #34]
+; CHECK-BE-NEXT:  	ushll	v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	strh	w10, [x1, #28]
+; CHECK-BE-NEXT:  	mov	x12, v5.d[1]
+; CHECK-BE-NEXT:  	stur	wzr, [x1, #6]
+; CHECK-BE-NEXT:  	str	wzr, [x1]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #130]
+; CHECK-BE-NEXT:  	fmov	x9, d2
+; CHECK-BE-NEXT:  	ushll2	v2.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	strh	w12, [x1, #22]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #124]
+; CHECK-BE-NEXT:  	mov	x9, v2.d[1]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #118]
+; CHECK-BE-NEXT:  	fmov	x9, d2
+; CHECK-BE-NEXT:  	strh	w9, [x1, #112]
+; CHECK-BE-NEXT:  	mov	x9, v0.d[1]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #10]
+; CHECK-BE-NEXT:  	fmov	x9, d0
+; CHECK-BE-NEXT:  	ushll	v0.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	strh	w9, [x1, #4]
+; CHECK-BE-NEXT:  	mov	x9, v0.d[1]
+; CHECK-BE-NEXT:  	strh	w9, [x1, #106]
+; CHECK-BE-NEXT:  	fmov	x9, d0
+; CHECK-BE-NEXT:  	strh	w9, [x1, #100]
+; CHECK-BE-NEXT:  	add	x1, x1, #128
+; CHECK-BE-NEXT:  	b.ne	.LBB21_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; Load 23 bytes starting at %src + %iv.
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <23 x i8>*
+  %load = load <23 x i8>, <23 x i8>* %src.gep.cast
+  ; Zero-extend every byte lane to 48 bits.
+  %ext = zext <23 x i8> %load to <23 x i48>
+  ; Store the widened vector at &%dst[%iv] (indexed in i48 elements).
+  %dst.gep = getelementptr i48, i48* %dst, i64 %iv
+  %dst.gep.cast = bitcast i48* %dst.gep to <23 x i48>*
+  store <23 x i48> %ext, <23 x i48>* %dst.gep.cast
+  ; %iv advances by 16 per iteration (fewer than the 23 lanes accessed);
+  ; the loop exits once it reaches 128.
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Loop test: zext <8 x i8> -> <8 x i33>, stored to %dst. The destination
+; element width (33 bits) is not a multiple of 8.
+; The first group of expectations below is for the arm64-apple-ios run lines,
+; the second group for the aarch64_be-unknown-linux (big-endian) run lines.
+define void @zext_v8i8_to_v8i33_in_loop(i8* %src, i33* %dst) {
+; CHECK-LABEL:  _zext_v8i8_to_v8i33_in_loop:
+; CHECK:  ; %bb.0:                                ; %entry
+; CHECK-NEXT: 	mov	x8, xzr
+; CHECK-NEXT: LBB22_1:                                ; %loop
+; CHECK-NEXT:                                         ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: 	ldr	d0, [x0, x8]
+; CHECK-NEXT: 	add	x8, x8, #16
+; CHECK-NEXT: 	cmp	x8, #128
+; CHECK-NEXT: 	strb	wzr, [x1, #32]
+; CHECK-NEXT: 	ushll.8h	v0, v0, #0
+; CHECK-NEXT: 	ushll.4s	v1, v0, #0
+; CHECK-NEXT: 	ushll2.4s	v0, v0, #0
+; CHECK-NEXT: 	ushll.2d	v2, v1, #0
+; CHECK-NEXT: 	ushll2.2d	v3, v0, #0
+; CHECK-NEXT: 	ushll2.2d	v1, v1, #0
+; CHECK-NEXT: 	mov.d	x9, v3[1]
+; CHECK-NEXT: 	fmov	x10, d3
+; CHECK-NEXT: 	mov.d	x12, v1[1]
+; CHECK-NEXT: 	ushll.2d	v0, v0, #0
+; CHECK-NEXT: 	lsl	x9, x9, #39
+; CHECK-NEXT: 	orr	x9, x9, x10, lsl #6
+; CHECK-NEXT: 	fmov	x10, d1
+; CHECK-NEXT: 	mov.d	x11, v0[1]
+; CHECK-NEXT: 	lsl	x12, x12, #35
+; CHECK-NEXT: 	mov.d	x14, v2[1]
+; CHECK-NEXT: 	fmov	x13, d0
+; CHECK-NEXT: 	orr	x10, x12, x10, lsl #2
+; CHECK-NEXT: 	fmov	x12, d2
+; CHECK-NEXT: 	lsl	x11, x11, #37
+; CHECK-NEXT: 	orr	x11, x11, x13, lsl #4
+; CHECK-NEXT: 	orr	x12, x12, x14, lsl #33
+; CHECK-NEXT: 	stp	x11, x9, [x1, #16]
+; CHECK-NEXT: 	stp	x12, x10, [x1], #128
+; CHECK-NEXT: 	b.ne	LBB22_1
+; CHECK-NEXT: ; %bb.2:                                ; %exit
+; CHECK-NEXT: 	ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i33_in_loop:
+; CHECK-BE:  // %bb.0:                               // %entry
+; CHECK-BE-NEXT:  	mov	x8, xzr
+; CHECK-BE-NEXT:  .LBB22_1:                               // %loop
+; CHECK-BE-NEXT:                                          // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:  	add	x9, x0, x8
+; CHECK-BE-NEXT:  	add	x8, x8, #16
+; CHECK-BE-NEXT:  	cmp	x8, #128
+; CHECK-BE-NEXT:  	ld1	{ v0.8b }, [x9]
+; CHECK-BE-NEXT:  	ushll	v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:  	ushll	v1.4s, v0.4h, #0
+; CHECK-BE-NEXT:  	ushll2	v0.4s, v0.8h, #0
+; CHECK-BE-NEXT:  	ushll	v2.2d, v1.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:  	ushll	v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:  	ushll2	v1.2d, v1.4s, #0
+; CHECK-BE-NEXT:  	mov	x9, v3.d[1]
+; CHECK-BE-NEXT:  	fmov	x10, d3
+; CHECK-BE-NEXT:  	mov	x11, v0.d[1]
+; CHECK-BE-NEXT:  	fmov	x12, d0
+; CHECK-BE-NEXT:  	mov	x13, v1.d[1]
+; CHECK-BE-NEXT:  	mov	x14, v2.d[1]
+; CHECK-BE-NEXT:  	orr	x10, x9, x10, lsl #33
+; CHECK-BE-NEXT:  	fmov	x15, d1
+; CHECK-BE-NEXT:  	strb	w9, [x1, #32]
+; CHECK-BE-NEXT:  	fmov	x16, d2
+; CHECK-BE-NEXT:  	lsl	x11, x11, #2
+; CHECK-BE-NEXT:  	lsl	x13, x13, #4
+; CHECK-BE-NEXT:  	orr	x12, x11, x12, lsl #35
+; CHECK-BE-NEXT:  	lsl	x14, x14, #6
+; CHECK-BE-NEXT:  	orr	x15, x13, x15, lsl #37
+; CHECK-BE-NEXT:  	extr	x10, x11, x10, #8
+; CHECK-BE-NEXT:  	orr	x11, x14, x16, lsl #39
+; CHECK-BE-NEXT:  	extr	x12, x13, x12, #8
+; CHECK-BE-NEXT:  	extr	x9, x14, x15, #8
+; CHECK-BE-NEXT:  	extr	x11, xzr, x11, #8
+; CHECK-BE-NEXT:  	stp	x12, x10, [x1, #16]
+; CHECK-BE-NEXT:  	stp	x11, x9, [x1], #128
+; CHECK-BE-NEXT:  	b.ne	.LBB22_1
+; CHECK-BE-NEXT:  // %bb.2:                               // %exit
+; CHECK-BE-NEXT:  	ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; Load 8 bytes starting at %src + %iv.
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  ; Zero-extend every byte lane to 33 bits.
+  %ext = zext <8 x i8> %load to <8 x i33>
+  ; Store the widened vector at &%dst[%iv] (indexed in i33 elements).
+  %dst.gep = getelementptr i33, i33* %dst, i64 %iv
+  %dst.gep.cast = bitcast i33* %dst.gep to <8 x i33>*
+  store <8 x i33> %ext, <8 x i33>* %dst.gep.cast
+  ; %iv advances by 16 per iteration; the loop exits once it reaches 128.
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}


        


More information about the llvm-commits mailing list