[llvm] 3b65752 - [AArch64] Enabled and regenerate clmul-fixed.ll. NFC (#184628)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 00:32:47 PST 2026
Author: David Green
Date: 2026-03-05T08:32:42Z
New Revision: 3b657524b69f49165377ff507f0d024633bfa37b
URL: https://github.com/llvm/llvm-project/commit/3b657524b69f49165377ff507f0d024633bfa37b
DIFF: https://github.com/llvm/llvm-project/commit/3b657524b69f49165377ff507f0d024633bfa37b.diff
LOG: [AArch64] Enabled and regenerate clmul-fixed.ll. NFC (#184628)
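The v2i64 tests (and the v1i64 test) in clmul-fixed.ll are now fixed, so they
are enabled and their check lines regenerated. The tests that are still disabled
in clmul-scalable.ll require i128 vectors, which are generally not supported.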
Added:
Modified:
llvm/test/CodeGen/AArch64/clmul-fixed.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 6dbc0b4a70f37..37b7a26a8bbfb 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -432,16 +432,1229 @@ define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
ret <2 x i32> %a
}
-; TODO: Fix
-; define <2 x i64> @clmul_v2i64_neon(<2 x i64> %x, <2 x i64> %y) {
-; %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %x, <2 x i64> %y)
-; ret <2 x i64> %a
-; }
-; TODO: Fix
-; define <1 x i64> @clmul_v1i64_neon(<1 x i64> %x, <1 x i64> %y) {
-; %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %x, <1 x i64> %y)
-; ret <1 x i64> %a
-; }
+define <2 x i64> @clmul_v2i64_neon(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-NEON-LABEL: clmul_v2i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: sub sp, sp, #416
+; CHECK-NEON-NEXT: stp d13, d12, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp d11, d10, [sp, #288] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp d9, d8, [sp, #304] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x29, x30, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x28, x27, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x26, x25, [sp, #352] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x24, x23, [sp, #368] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x22, x21, [sp, #384] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x20, x19, [sp, #400] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: .cfi_def_cfa_offset 416
+; CHECK-NEON-NEXT: .cfi_offset w19, -8
+; CHECK-NEON-NEXT: .cfi_offset w20, -16
+; CHECK-NEON-NEXT: .cfi_offset w21, -24
+; CHECK-NEON-NEXT: .cfi_offset w22, -32
+; CHECK-NEON-NEXT: .cfi_offset w23, -40
+; CHECK-NEON-NEXT: .cfi_offset w24, -48
+; CHECK-NEON-NEXT: .cfi_offset w25, -56
+; CHECK-NEON-NEXT: .cfi_offset w26, -64
+; CHECK-NEON-NEXT: .cfi_offset w27, -72
+; CHECK-NEON-NEXT: .cfi_offset w28, -80
+; CHECK-NEON-NEXT: .cfi_offset w30, -88
+; CHECK-NEON-NEXT: .cfi_offset w29, -96
+; CHECK-NEON-NEXT: .cfi_offset b8, -104
+; CHECK-NEON-NEXT: .cfi_offset b9, -112
+; CHECK-NEON-NEXT: .cfi_offset b10, -120
+; CHECK-NEON-NEXT: .cfi_offset b11, -128
+; CHECK-NEON-NEXT: .cfi_offset b12, -136
+; CHECK-NEON-NEXT: .cfi_offset b13, -144
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: fmov x9, d0
+; CHECK-NEON-NEXT: mov w10, #8 // =0x8
+; CHECK-NEON-NEXT: dup v2.2d, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: mov w14, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: dup v3.2d, x8
+; CHECK-NEON-NEXT: mov w8, #4 // =0x4
+; CHECK-NEON-NEXT: mov x3, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: dup v4.2d, x8
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: fmov x8, d2
+; CHECK-NEON-NEXT: mov x12, v2.d[1]
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: mov x13, v4.d[1]
+; CHECK-NEON-NEXT: mul x16, x9, x8
+; CHECK-NEON-NEXT: mov x8, v0.d[1]
+; CHECK-NEON-NEXT: dup v0.2d, x10
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: mul x23, x9, x11
+; CHECK-NEON-NEXT: mov x11, v3.d[1]
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: mul x5, x9, x10
+; CHECK-NEON-NEXT: mov w10, #16 // =0x10
+; CHECK-NEON-NEXT: dup v2.2d, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: fmov d5, x23
+; CHECK-NEON-NEXT: mov x23, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul x27, x8, x12
+; CHECK-NEON-NEXT: mov w12, #64 // =0x40
+; CHECK-NEON-NEXT: fmov d16, x5
+; CHECK-NEON-NEXT: mov x5, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: str x11, [sp, #248] // 8-byte Spill
+; CHECK-NEON-NEXT: mov w11, #32 // =0x20
+; CHECK-NEON-NEXT: dup v3.2d, x11
+; CHECK-NEON-NEXT: mul x11, x8, x13
+; CHECK-NEON-NEXT: mov w13, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: stp x10, x11, [sp, #256] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #128 // =0x80
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: mul x29, x8, x10
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #232] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #256 // =0x100
+; CHECK-NEON-NEXT: ldr d19, [sp, #232] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #240] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #208] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #512 // =0x200
+; CHECK-NEON-NEXT: ldr d17, [sp, #208] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mov v17.d[1], x29
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #216] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #184] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #1024 // =0x400
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #224] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #176] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2048 // =0x800
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x22, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4096 // =0x1000
+; CHECK-NEON-NEXT: str x10, [sp, #192] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: fmov d22, x22
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: str x11, [sp, #160] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8192 // =0x2000
+; CHECK-NEON-NEXT: ldr d28, [sp, #160] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #200] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: str x11, [sp, #136] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16384 // =0x4000
+; CHECK-NEON-NEXT: ldr d23, [sp, #136] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #168] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #112] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #32768 // =0x8000
+; CHECK-NEON-NEXT: ldr d7, [sp, #112] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x28, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #144] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mul x18, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: mov w12, #65536 // =0x10000
+; CHECK-NEON-NEXT: fmov d27, x28
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: fmov d30, x18
+; CHECK-NEON-NEXT: str x10, [sp, #152] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #131072 // =0x20000
+; CHECK-NEON-NEXT: str x11, [sp, #96] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: ldr d25, [sp, #96] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #120] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #262144 // =0x40000
+; CHECK-NEON-NEXT: str x11, [sp, #72] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: ldr d21, [sp, #72] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: mul x6, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #128] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: dup v0.2d, x12
+; CHECK-NEON-NEXT: mov w12, #524288 // =0x80000
+; CHECK-NEON-NEXT: mul x26, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: fmov d24, x26
+; CHECK-NEON-NEXT: str x10, [sp, #104] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #1048576 // =0x100000
+; CHECK-NEON-NEXT: str x11, [sp, #40] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: ldr d29, [sp, #40] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: str x10, [sp, #80] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2097152 // =0x200000
+; CHECK-NEON-NEXT: str x11, [sp, #24] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: ldr d26, [sp, #24] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x25, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: str x10, [sp, #88] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: dup v0.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4194304 // =0x400000
+; CHECK-NEON-NEXT: mul x19, x9, x11
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: str x10, [sp, #56] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8388608 // =0x800000
+; CHECK-NEON-NEXT: mul x0, x9, x11
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #64] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: mul x2, x9, x11
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: str x10, [sp, #48] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: dup v0.2d, x12
+; CHECK-NEON-NEXT: mov w12, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: mul x1, x9, x11
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: mul x30, x8, x10
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x15, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #8] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: mul x17, x9, x11
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: mul x20, x8, x10
+; CHECK-NEON-NEXT: mov x10, v0.d[1]
+; CHECK-NEON-NEXT: dup v0.2d, x12
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: mul x4, x8, x10
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x13
+; CHECK-NEON-NEXT: mov x11, v0.d[1]
+; CHECK-NEON-NEXT: fmov x13, d0
+; CHECK-NEON-NEXT: dup v0.2d, x14
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul x24, x8, x10
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: and v6.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: fmov d0, x16
+; CHECK-NEON-NEXT: mul x13, x9, x13
+; CHECK-NEON-NEXT: mul x7, x8, x10
+; CHECK-NEON-NEXT: mov w10, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: mov v0.d[1], x27
+; CHECK-NEON-NEXT: dup v4.2d, x10
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mul x12, x9, x10
+; CHECK-NEON-NEXT: movi v4.4s, #128, lsl #24
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, v2.d[1]
+; CHECK-NEON-NEXT: mov x14, v3.d[1]
+; CHECK-NEON-NEXT: fmov x16, d3
+; CHECK-NEON-NEXT: dup v3.2d, x3
+; CHECK-NEON-NEXT: fneg v4.2d, v4.2d
+; CHECK-NEON-NEXT: mul x21, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #16] // 8-byte Spill
+; CHECK-NEON-NEXT: mul x10, x8, x14
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mov x14, v6.d[1]
+; CHECK-NEON-NEXT: and v4.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x5
+; CHECK-NEON-NEXT: mul x3, x9, x16
+; CHECK-NEON-NEXT: mov x16, v2.d[1]
+; CHECK-NEON-NEXT: fmov x5, d2
+; CHECK-NEON-NEXT: dup v2.2d, x23
+; CHECK-NEON-NEXT: mul x27, x8, x14
+; CHECK-NEON-NEXT: fmov x14, d6
+; CHECK-NEON-NEXT: and v6.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: str x10, [sp, #32] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #248] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x23, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: ldp d3, d18, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mov v5.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x22, v6.d[1]
+; CHECK-NEON-NEXT: mul x11, x9, x11
+; CHECK-NEON-NEXT: mov v16.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #256] // 8-byte Reload
+; CHECK-NEON-NEXT: fmov x18, d2
+; CHECK-NEON-NEXT: mul x14, x9, x14
+; CHECK-NEON-NEXT: mov v19.d[1], x10
+; CHECK-NEON-NEXT: eor v13.16b, v5.16b, v0.16b
+; CHECK-NEON-NEXT: fmov d0, x12
+; CHECK-NEON-NEXT: mul x10, x8, x16
+; CHECK-NEON-NEXT: mov x16, v4.d[1]
+; CHECK-NEON-NEXT: mov x12, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: fmov d5, x17
+; CHECK-NEON-NEXT: mov v0.d[1], x7
+; CHECK-NEON-NEXT: mul x5, x9, x5
+; CHECK-NEON-NEXT: eor v16.16b, v16.16b, v19.16b
+; CHECK-NEON-NEXT: dup v19.2d, x12
+; CHECK-NEON-NEXT: mov v5.d[1], x24
+; CHECK-NEON-NEXT: mul x18, x9, x18
+; CHECK-NEON-NEXT: str x10, [sp, #256] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #240] // 8-byte Reload
+; CHECK-NEON-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEON-NEXT: mov v18.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #216] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v3.d[1], x10
+; CHECK-NEON-NEXT: mul x10, x8, x16
+; CHECK-NEON-NEXT: fmov x16, d4
+; CHECK-NEON-NEXT: dup v4.2d, x23
+; CHECK-NEON-NEXT: fmov x23, d6
+; CHECK-NEON-NEXT: mul x16, x9, x16
+; CHECK-NEON-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: str x10, [sp, #232] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #224] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x28, x9, x23
+; CHECK-NEON-NEXT: mov v22.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #192] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v28.d[1], x10
+; CHECK-NEON-NEXT: mul x10, x8, x22
+; CHECK-NEON-NEXT: mov x22, v2.d[1]
+; CHECK-NEON-NEXT: str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: dup v6.2d, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #200] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v23.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #168] // 8-byte Reload
+; CHECK-NEON-NEXT: and v20.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x6
+; CHECK-NEON-NEXT: fmov x6, d4
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #144] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v27.d[1], x10
+; CHECK-NEON-NEXT: mul x10, x8, x22
+; CHECK-NEON-NEXT: mov x22, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: dup v2.2d, x22
+; CHECK-NEON-NEXT: ldr x22, [sp, #152] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x26, x9, x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #128] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v30.d[1], x22
+; CHECK-NEON-NEXT: ldr x22, [sp, #120] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v21.d[1], x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #104] // 8-byte Reload
+; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: str x10, [sp, #248] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v4.d[1]
+; CHECK-NEON-NEXT: mov v25.d[1], x22
+; CHECK-NEON-NEXT: mov v6.d[1], x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #80] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x22, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: dup v4.2d, x22
+; CHECK-NEON-NEXT: mov x22, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: mov v24.d[1], x6
+; CHECK-NEON-NEXT: fmov x6, d20
+; CHECK-NEON-NEXT: dup v8.2d, x22
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: and v31.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d4, x19
+; CHECK-NEON-NEXT: mov x19, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: mul x22, x9, x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #88] // 8-byte Reload
+; CHECK-NEON-NEXT: and v10.16b, v1.16b, v8.16b
+; CHECK-NEON-NEXT: dup v9.2d, x19
+; CHECK-NEON-NEXT: fmov d8, x2
+; CHECK-NEON-NEXT: mov v29.d[1], x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #56] // 8-byte Reload
+; CHECK-NEON-NEXT: fmov x2, d31
+; CHECK-NEON-NEXT: str x10, [sp, #240] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v20.d[1]
+; CHECK-NEON-NEXT: fmov d20, x25
+; CHECK-NEON-NEXT: mov v26.d[1], x6
+; CHECK-NEON-NEXT: fmov x6, d2
+; CHECK-NEON-NEXT: and v11.16b, v1.16b, v9.16b
+; CHECK-NEON-NEXT: fmov d9, x1
+; CHECK-NEON-NEXT: mul x1, x9, x2
+; CHECK-NEON-NEXT: ldr x2, [sp, #8] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mov v8.d[1], x2
+; CHECK-NEON-NEXT: mov x2, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: fmov x12, d11
+; CHECK-NEON-NEXT: mov v9.d[1], x20
+; CHECK-NEON-NEXT: mul x12, x9, x12
+; CHECK-NEON-NEXT: str x10, [sp, #216] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: fmov d2, x0
+; CHECK-NEON-NEXT: mul x0, x9, x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #64] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v20.d[1], x6
+; CHECK-NEON-NEXT: ldr x6, [sp, #48] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v2.d[1], x30
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mov v4.d[1], x6
+; CHECK-NEON-NEXT: mov x6, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: dup v12.2d, x6
+; CHECK-NEON-NEXT: str x10, [sp, #224] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v31.d[1]
+; CHECK-NEON-NEXT: fmov d31, x15
+; CHECK-NEON-NEXT: mov x15, v11.d[1]
+; CHECK-NEON-NEXT: mov v31.d[1], x4
+; CHECK-NEON-NEXT: mov x4, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: mul x29, x8, x10
+; CHECK-NEON-NEXT: mov x10, v10.d[1]
+; CHECK-NEON-NEXT: mul x6, x8, x15
+; CHECK-NEON-NEXT: mul x25, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d10
+; CHECK-NEON-NEXT: and v10.16b, v1.16b, v12.16b
+; CHECK-NEON-NEXT: dup v12.2d, x2
+; CHECK-NEON-NEXT: mul x2, x9, x10
+; CHECK-NEON-NEXT: mov x10, v10.d[1]
+; CHECK-NEON-NEXT: and v11.16b, v1.16b, v12.16b
+; CHECK-NEON-NEXT: eor v12.16b, v13.16b, v16.16b
+; CHECK-NEON-NEXT: eor v16.16b, v17.16b, v18.16b
+; CHECK-NEON-NEXT: eor v17.16b, v22.16b, v28.16b
+; CHECK-NEON-NEXT: dup v18.2d, x4
+; CHECK-NEON-NEXT: mov x4, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: eor v22.16b, v16.16b, v3.16b
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mov x11, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v23.16b
+; CHECK-NEON-NEXT: eor v23.16b, v27.16b, v30.16b
+; CHECK-NEON-NEXT: dup v27.2d, x11
+; CHECK-NEON-NEXT: fmov x11, d11
+; CHECK-NEON-NEXT: mul x17, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d10
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEON-NEXT: eor v22.16b, v12.16b, v22.16b
+; CHECK-NEON-NEXT: eor v7.16b, v17.16b, v7.16b
+; CHECK-NEON-NEXT: eor v17.16b, v23.16b, v25.16b
+; CHECK-NEON-NEXT: dup v23.2d, x4
+; CHECK-NEON-NEXT: mul x20, x9, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #16] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v3.d[1], x21
+; CHECK-NEON-NEXT: mov x21, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: mov x15, v11.d[1]
+; CHECK-NEON-NEXT: mul x7, x9, x10
+; CHECK-NEON-NEXT: mov x10, v19.d[1]
+; CHECK-NEON-NEXT: mov v16.d[1], x11
+; CHECK-NEON-NEXT: mov x11, v18.d[1]
+; CHECK-NEON-NEXT: eor v22.16b, v22.16b, v7.16b
+; CHECK-NEON-NEXT: fmov d7, x3
+; CHECK-NEON-NEXT: mov x3, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v21.16b
+; CHECK-NEON-NEXT: eor v21.16b, v24.16b, v29.16b
+; CHECK-NEON-NEXT: dup v25.2d, x3
+; CHECK-NEON-NEXT: fmov d24, x14
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: ldp d11, d10, [sp, #288] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d19
+; CHECK-NEON-NEXT: and v19.16b, v1.16b, v27.16b
+; CHECK-NEON-NEXT: eor v6.16b, v17.16b, v6.16b
+; CHECK-NEON-NEXT: eor v17.16b, v21.16b, v26.16b
+; CHECK-NEON-NEXT: eor v21.16b, v8.16b, v9.16b
+; CHECK-NEON-NEXT: mul x23, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d18
+; CHECK-NEON-NEXT: and v18.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT: mov v24.d[1], x27
+; CHECK-NEON-NEXT: mov x27, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: dup v23.2d, x21
+; CHECK-NEON-NEXT: mul x4, x9, x10
+; CHECK-NEON-NEXT: mov x10, v19.d[1]
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: eor v20.16b, v21.16b, v31.16b
+; CHECK-NEON-NEXT: fmov d21, x5
+; CHECK-NEON-NEXT: eor v6.16b, v22.16b, v6.16b
+; CHECK-NEON-NEXT: mul x24, x9, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #32] // 8-byte Reload
+; CHECK-NEON-NEXT: fmov d22, x16
+; CHECK-NEON-NEXT: mov x5, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: eor v4.16b, v17.16b, v4.16b
+; CHECK-NEON-NEXT: dup v17.2d, x27
+; CHECK-NEON-NEXT: mov v7.d[1], x11
+; CHECK-NEON-NEXT: mov x11, v18.d[1]
+; CHECK-NEON-NEXT: mul x19, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d19
+; CHECK-NEON-NEXT: and v19.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT: eor v5.16b, v20.16b, v5.16b
+; CHECK-NEON-NEXT: dup v25.2d, x5
+; CHECK-NEON-NEXT: eor v2.16b, v4.16b, v2.16b
+; CHECK-NEON-NEXT: mov x27, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: dup v20.2d, x27
+; CHECK-NEON-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEON-NEXT: mul x3, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d18
+; CHECK-NEON-NEXT: eor v7.16b, v7.16b, v24.16b
+; CHECK-NEON-NEXT: eor v0.16b, v5.16b, v0.16b
+; CHECK-NEON-NEXT: fmov d5, x28
+; CHECK-NEON-NEXT: and v18.16b, v1.16b, v23.16b
+; CHECK-NEON-NEXT: mul x30, x9, x10
+; CHECK-NEON-NEXT: mov x10, v19.d[1]
+; CHECK-NEON-NEXT: eor v2.16b, v6.16b, v2.16b
+; CHECK-NEON-NEXT: fmov d6, x26
+; CHECK-NEON-NEXT: ldp d9, d8, [sp, #304] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x16, x9, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #256] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov x27, d18
+; CHECK-NEON-NEXT: and v16.16b, v1.16b, v20.16b
+; CHECK-NEON-NEXT: mov x21, v18.d[1]
+; CHECK-NEON-NEXT: mov v21.d[1], x11
+; CHECK-NEON-NEXT: mul x5, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d19
+; CHECK-NEON-NEXT: ldr x11, [sp, #232] // 8-byte Reload
+; CHECK-NEON-NEXT: and v19.16b, v1.16b, v25.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x28, x9, x27
+; CHECK-NEON-NEXT: ldp d13, d12, [sp, #272] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mov v22.d[1], x11
+; CHECK-NEON-NEXT: mov x11, v17.d[1]
+; CHECK-NEON-NEXT: mul x14, x9, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v4.16b, v7.16b, v21.16b
+; CHECK-NEON-NEXT: fmov d7, x18
+; CHECK-NEON-NEXT: mov x18, v19.d[1]
+; CHECK-NEON-NEXT: eor v0.16b, v2.16b, v0.16b
+; CHECK-NEON-NEXT: mov v5.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #248] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x21, x8, x21
+; CHECK-NEON-NEXT: eor v4.16b, v4.16b, v22.16b
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: fmov x10, d19
+; CHECK-NEON-NEXT: mul x27, x8, x18
+; CHECK-NEON-NEXT: mov x18, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEON-NEXT: fmov d5, x22
+; CHECK-NEON-NEXT: mov x22, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: mul x26, x9, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #240] // 8-byte Reload
+; CHECK-NEON-NEXT: dup v3.2d, x18
+; CHECK-NEON-NEXT: mov v6.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #216] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x18, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d17
+; CHECK-NEON-NEXT: dup v17.2d, x22
+; CHECK-NEON-NEXT: eor v2.16b, v4.16b, v7.16b
+; CHECK-NEON-NEXT: mov v5.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v16.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: fmov d4, x0
+; CHECK-NEON-NEXT: fmov d7, x1
+; CHECK-NEON-NEXT: mul x22, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d16
+; CHECK-NEON-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEON-NEXT: mov x1, v3.d[1]
+; CHECK-NEON-NEXT: fmov d16, x2
+; CHECK-NEON-NEXT: mul x0, x8, x10
+; CHECK-NEON-NEXT: mov x10, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: mov v7.d[1], x29
+; CHECK-NEON-NEXT: dup v6.2d, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #224] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x2, x9, x11
+; CHECK-NEON-NEXT: mov x11, v17.d[1]
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v5.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x10
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: mov x12, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: mov v16.d[1], x25
+; CHECK-NEON-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEON-NEXT: dup v5.2d, x12
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v7.16b
+; CHECK-NEON-NEXT: mul x1, x8, x1
+; CHECK-NEON-NEXT: mov v3.d[1], x6
+; CHECK-NEON-NEXT: mul x12, x9, x10
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d4, x7
+; CHECK-NEON-NEXT: fmov x10, d17
+; CHECK-NEON-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEON-NEXT: mov x25, v6.d[1]
+; CHECK-NEON-NEXT: mul x6, x8, x11
+; CHECK-NEON-NEXT: mov x11, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: fmov d17, x20
+; CHECK-NEON-NEXT: dup v7.2d, x11
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x17
+; CHECK-NEON-NEXT: mul x7, x9, x10
+; CHECK-NEON-NEXT: fmov d16, x4
+; CHECK-NEON-NEXT: mov x10, v5.d[1]
+; CHECK-NEON-NEXT: mov x17, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: fmov x11, d6
+; CHECK-NEON-NEXT: mov x4, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: and v6.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT: dup v7.2d, x17
+; CHECK-NEON-NEXT: mov v17.d[1], x15
+; CHECK-NEON-NEXT: mov v16.d[1], x13
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d5
+; CHECK-NEON-NEXT: fmov d3, x24
+; CHECK-NEON-NEXT: and v5.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT: dup v7.2d, x4
+; CHECK-NEON-NEXT: fmov d4, x30
+; CHECK-NEON-NEXT: mul x17, x9, x11
+; CHECK-NEON-NEXT: mov x11, v6.d[1]
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v17.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d16, x16
+; CHECK-NEON-NEXT: mov v3.d[1], x23
+; CHECK-NEON-NEXT: mul x4, x9, x10
+; CHECK-NEON-NEXT: mov x16, v5.d[1]
+; CHECK-NEON-NEXT: fmov x10, d6
+; CHECK-NEON-NEXT: and v6.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: mov v4.d[1], x19
+; CHECK-NEON-NEXT: mov v16.d[1], x3
+; CHECK-NEON-NEXT: mov x3, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: mov x19, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: dup v17.2d, x19
+; CHECK-NEON-NEXT: mov x20, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: mov v7.d[1], x5
+; CHECK-NEON-NEXT: mov x5, v6.d[1]
+; CHECK-NEON-NEXT: mul x14, x8, x16
+; CHECK-NEON-NEXT: fmov x16, d5
+; CHECK-NEON-NEXT: fmov d5, x28
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: dup v4.2d, x3
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v16.16b
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v17.16b
+; CHECK-NEON-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEON-NEXT: dup v16.2d, x20
+; CHECK-NEON-NEXT: mul x15, x8, x25
+; CHECK-NEON-NEXT: mov v5.d[1], x21
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: fmov d7, x26
+; CHECK-NEON-NEXT: mul x3, x9, x16
+; CHECK-NEON-NEXT: mov x19, v3.d[1]
+; CHECK-NEON-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: fmov x20, d3
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v16.16b
+; CHECK-NEON-NEXT: mul x16, x8, x5
+; CHECK-NEON-NEXT: fmov x5, d6
+; CHECK-NEON-NEXT: fmov d6, x22
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v5.16b
+; CHECK-NEON-NEXT: mov v7.d[1], x27
+; CHECK-NEON-NEXT: fmov v5.2d, #2.00000000
+; CHECK-NEON-NEXT: fmov x21, d3
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v6.d[1], x18
+; CHECK-NEON-NEXT: mov x18, v4.d[1]
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: ldp x24, x23, [sp, #368] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: fneg v7.2d, v17.2d
+; CHECK-NEON-NEXT: mul x5, x9, x5
+; CHECK-NEON-NEXT: ldp x26, x25, [sp, #352] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x2
+; CHECK-NEON-NEXT: fmov x2, d4
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x20, x9, x20
+; CHECK-NEON-NEXT: ldp x28, x27, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mov v6.d[1], x0
+; CHECK-NEON-NEXT: mov x0, v3.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v1.16b, v5.16b
+; CHECK-NEON-NEXT: fmov d5, x7
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v7.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x1
+; CHECK-NEON-NEXT: mul x2, x9, x2
+; CHECK-NEON-NEXT: ldp x29, x30, [sp, #320] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mov x1, v3.d[1]
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x4
+; CHECK-NEON-NEXT: mov v5.d[1], x6
+; CHECK-NEON-NEXT: fmov x6, d3
+; CHECK-NEON-NEXT: fmov d3, x17
+; CHECK-NEON-NEXT: mul x12, x9, x21
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d4, x3
+; CHECK-NEON-NEXT: mov v6.d[1], x13
+; CHECK-NEON-NEXT: fmov x13, d1
+; CHECK-NEON-NEXT: mul x19, x8, x19
+; CHECK-NEON-NEXT: mov v3.d[1], x15
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v5.16b
+; CHECK-NEON-NEXT: fmov d5, x20
+; CHECK-NEON-NEXT: mul x17, x9, x6
+; CHECK-NEON-NEXT: mov v4.d[1], x14
+; CHECK-NEON-NEXT: ldp x22, x21, [sp, #384] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x9, x9, x13
+; CHECK-NEON-NEXT: mov x13, v1.d[1]
+; CHECK-NEON-NEXT: fmov d1, x10
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: fmov d3, x5
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: mul x18, x8, x18
+; CHECK-NEON-NEXT: mov v5.d[1], x19
+; CHECK-NEON-NEXT: mov v1.d[1], x11
+; CHECK-NEON-NEXT: ldp x20, x19, [sp, #400] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x10, x8, x0
+; CHECK-NEON-NEXT: mov v3.d[1], x16
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x11, x8, x1
+; CHECK-NEON-NEXT: eor v1.16b, v2.16b, v1.16b
+; CHECK-NEON-NEXT: fmov d2, x2
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v5.16b
+; CHECK-NEON-NEXT: mul x8, x8, x13
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mov v4.d[1], x10
+; CHECK-NEON-NEXT: mov v2.d[1], x18
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: fmov d3, x17
+; CHECK-NEON-NEXT: mov v3.d[1], x11
+; CHECK-NEON-NEXT: mov v5.d[1], x8
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v5.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: add sp, sp, #416
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmul_v2i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT: ret
+ %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %x, <2 x i64> %y)
+ ret <2 x i64> %a
+}
+
+define <1 x i64> @clmul_v1i64_neon(<1 x i64> %x, <1 x i64> %y) {
+; CHECK-NEON-LABEL: clmul_v1i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEON-NEXT: mov w9, #4 // =0x4
+; CHECK-NEON-NEXT: mov w10, #8 // =0x8
+; CHECK-NEON-NEXT: fmov d2, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: fmov x8, d0
+; CHECK-NEON-NEXT: fmov d0, x9
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: and v3.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: fmov x9, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: mov w10, #16 // =0x10
+; CHECK-NEON-NEXT: mul x14, x8, x9
+; CHECK-NEON-NEXT: fmov x9, d3
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x15, x8, x9
+; CHECK-NEON-NEXT: fmov x9, d0
+; CHECK-NEON-NEXT: fmov d0, x10
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d3, x14
+; CHECK-NEON-NEXT: mul x12, x8, x9
+; CHECK-NEON-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: mov w9, #32 // =0x20
+; CHECK-NEON-NEXT: fmov d2, x9
+; CHECK-NEON-NEXT: mov w9, #64 // =0x40
+; CHECK-NEON-NEXT: mul x11, x8, x10
+; CHECK-NEON-NEXT: fmov d4, x15
+; CHECK-NEON-NEXT: fmov x10, d0
+; CHECK-NEON-NEXT: fmov d0, x9
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: eor v3.8b, v4.8b, v3.8b
+; CHECK-NEON-NEXT: mul x9, x8, x10
+; CHECK-NEON-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: mov w10, #128 // =0x80
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: mov w10, #256 // =0x100
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: fmov x14, d0
+; CHECK-NEON-NEXT: fmov d0, x10
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: mul x10, x8, x14
+; CHECK-NEON-NEXT: mov w14, #512 // =0x200
+; CHECK-NEON-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: fmov x15, d2
+; CHECK-NEON-NEXT: fmov d2, x14
+; CHECK-NEON-NEXT: fmov x12, d0
+; CHECK-NEON-NEXT: mul x14, x8, x15
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov w15, #1024 // =0x400
+; CHECK-NEON-NEXT: fmov d0, x15
+; CHECK-NEON-NEXT: mov w15, #2048 // =0x800
+; CHECK-NEON-NEXT: fmov d7, x10
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: mov w10, #16384 // =0x4000
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x15
+; CHECK-NEON-NEXT: mov w15, #4096 // =0x1000
+; CHECK-NEON-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: fmov d4, x15
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov x15, d0
+; CHECK-NEON-NEXT: eor v0.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov w13, #8192 // =0x2000
+; CHECK-NEON-NEXT: fmov x9, d2
+; CHECK-NEON-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: eor v2.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: fmov x14, d4
+; CHECK-NEON-NEXT: fmov d4, x13
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v3.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: mul x12, x8, x14
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v7.8b
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mov w9, #32768 // =0x8000
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: fmov d7, x12
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: mov w11, #65536 // =0x10000
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: eor v2.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: and v3.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: eor v6.8b, v6.8b, v7.8b
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mov w10, #131072 // =0x20000
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov w11, #262144 // =0x40000
+; CHECK-NEON-NEXT: eor v5.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: and v6.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov w11, #524288 // =0x80000
+; CHECK-NEON-NEXT: fmov x12, d6
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: mov w12, #1048576 // =0x100000
+; CHECK-NEON-NEXT: and v5.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mov w12, #4194304 // =0x400000
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d7, x9
+; CHECK-NEON-NEXT: fmov x9, d5
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d16, x11
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: mul x12, x8, x9
+; CHECK-NEON-NEXT: mov w9, #8388608 // =0x800000
+; CHECK-NEON-NEXT: and v5.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: fmov x13, d5
+; CHECK-NEON-NEXT: fmov d16, x10
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: mov w11, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mov w11, #2097152 // =0x200000
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: movi v16.2s, #128, lsl #24
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: and v5.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: eor v3.8b, v7.8b, v3.8b
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x11, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d5
+; CHECK-NEON-NEXT: fmov d5, x13
+; CHECK-NEON-NEXT: fmov x14, d4
+; CHECK-NEON-NEXT: mul x12, x8, x10
+; CHECK-NEON-NEXT: mov w10, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: mov w11, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: mul x10, x8, x14
+; CHECK-NEON-NEXT: mov w14, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: and v6.8b, v1.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d7, x12
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: fmov x13, d6
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: fneg d7, d16
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mov w12, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: mov w13, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: fmov x14, d4
+; CHECK-NEON-NEXT: fmov d4, x13
+; CHECK-NEON-NEXT: and v6.8b, v1.8b, v6.8b
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: mov w14, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x15, d6
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: mul x14, x8, x15
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v7.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: fmov d7, x9
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: mov x11, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mov x14, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: mov x11, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: eor v3.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mov x9, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x9
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x11, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x12, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x12, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mov x12, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x12
+; CHECK-NEON-NEXT: mov x14, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x13, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x13
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: mov x11, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: fmov x9, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: mov x11, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mov x10, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x11, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov x12, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x12, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mov x12, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x12
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: mov x13, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x13
+; CHECK-NEON-NEXT: mov x13, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x13
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mov x12, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov x12, d2
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mov x11, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x9, x8, x12
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: mov x11, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mov x9, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x9
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x11, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mul x12, x8, x10
+; CHECK-NEON-NEXT: mov x10, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mov x11, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x11, x8, x13
+; CHECK-NEON-NEXT: mov x13, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x13
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: mov x14, #4611686018427387904 // =0x4000000000000000
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: movi d2, #0000000000000000
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v5.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: mul x10, x8, x13
+; CHECK-NEON-NEXT: fmov x13, d4
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v6.8b
+; CHECK-NEON-NEXT: fneg d2, d2
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x14
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d2, x12
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x11, x8, x14
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: fmov x10, d1
+; CHECK-NEON-NEXT: fmov d1, x13
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT: fmov d2, x9
+; CHECK-NEON-NEXT: mul x8, x8, x10
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d2, x12
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmul_v1i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-AES-NEXT: ret
+ %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %x, <1 x i64> %y) ; single-lane i64 clmul: CHECK-NEON expects the per-bit and/mul/eor expansion, CHECK-AES a single pmull
+ ret <1 x i64> %a
+}
define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
; CHECK-NEON-LABEL: clmul_v1i128_neon:
@@ -5126,25 +6339,1247 @@ define <2 x i32> @clmulr_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
ret <2 x i32> %res
}
-; TODO
-;define <2 x i64> @clmulr_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind {
-; %a.ext = zext <2 x i64> %a to <2 x i128>
-; %b.ext = zext <2 x i64> %b to <2 x i128>
-; %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
-; %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
-; %res = trunc <2 x i128> %res.ext to <2 x i64>
-; ret <2 x i64> %res
-;}
+define <2 x i64> @clmulr_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind {
+; CHECK-NEON-LABEL: clmulr_v2i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: sub sp, sp, #464
+; CHECK-NEON-NEXT: rev64 v1.16b, v1.16b
+; CHECK-NEON-NEXT: rev64 v2.16b, v0.16b
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: mov x2, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: rbit v0.16b, v1.16b
+; CHECK-NEON-NEXT: dup v1.2d, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: dup v3.2d, x8
+; CHECK-NEON-NEXT: mov w8, #4 // =0x4
+; CHECK-NEON-NEXT: rbit v2.16b, v2.16b
+; CHECK-NEON-NEXT: dup v4.2d, x8
+; CHECK-NEON-NEXT: mov w8, #8 // =0x8
+; CHECK-NEON-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: dup v5.2d, x8
+; CHECK-NEON-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: fmov x8, d2
+; CHECK-NEON-NEXT: stp d11, d10, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v5.16b, v0.16b, v5.16b
+; CHECK-NEON-NEXT: stp d9, d8, [sp, #352] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: mov x11, v1.d[1]
+; CHECK-NEON-NEXT: mov x13, v3.d[1]
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: stp d13, d12, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: fmov x12, d5
+; CHECK-NEON-NEXT: str d14, [sp, #304] // 8-byte Spill
+; CHECK-NEON-NEXT: mul x4, x8, x9
+; CHECK-NEON-NEXT: fmov x9, d3
+; CHECK-NEON-NEXT: mul x6, x8, x10
+; CHECK-NEON-NEXT: mov w10, #16 // =0x10
+; CHECK-NEON-NEXT: dup v1.2d, x10
+; CHECK-NEON-NEXT: mov x10, v4.d[1]
+; CHECK-NEON-NEXT: mul x3, x8, x9
+; CHECK-NEON-NEXT: mov x9, v2.d[1]
+; CHECK-NEON-NEXT: mul x24, x8, x12
+; CHECK-NEON-NEXT: mov x12, v5.d[1]
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov d5, x3
+; CHECK-NEON-NEXT: mov x3, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: mul x28, x9, x11
+; CHECK-NEON-NEXT: mov w11, #32 // =0x20
+; CHECK-NEON-NEXT: dup v2.2d, x11
+; CHECK-NEON-NEXT: mul x11, x9, x13
+; CHECK-NEON-NEXT: mov w13, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: str x10, [sp, #296] // 8-byte Spill
+; CHECK-NEON-NEXT: mov w10, #64 // =0x40
+; CHECK-NEON-NEXT: dup v3.2d, x10
+; CHECK-NEON-NEXT: mul x10, x9, x12
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mov w12, #128 // =0x80
+; CHECK-NEON-NEXT: str x11, [sp, #312] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x11, v1.d[1]
+; CHECK-NEON-NEXT: str x10, [sp, #272] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: mul x27, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #256 // =0x100
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #280] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #256] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #512 // =0x200
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #288] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #248] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #1024 // =0x400
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d25, d18, [sp, #248] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mov v18.d[1], x27
+; CHECK-NEON-NEXT: mov x27, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #224] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2048 // =0x800
+; CHECK-NEON-NEXT: ldr d19, [sp, #224] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #232] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #200] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4096 // =0x1000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #240] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #192] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8192 // =0x2000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d26, d20, [sp, #192] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #208] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #176] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16384 // =0x4000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #216] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #168] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #32768 // =0x8000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d17, d21, [sp, #168] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #184] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #152] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #65536 // =0x10000
+; CHECK-NEON-NEXT: ldr d22, [sp, #152] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x15, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #131072 // =0x20000
+; CHECK-NEON-NEXT: str x10, [sp, #144] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov d23, x15
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #120] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #262144 // =0x40000
+; CHECK-NEON-NEXT: mul x17, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #160] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #524288 // =0x80000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: str x11, [sp, #104] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: str x10, [sp, #128] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mov w12, #1048576 // =0x100000
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #80] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: ldr d9, [sp, #80] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #136] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2097152 // =0x200000
+; CHECK-NEON-NEXT: mul x25, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x30, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: fmov d10, x25
+; CHECK-NEON-NEXT: str x10, [sp, #112] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4194304 // =0x400000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov d30, x30
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #48] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: ldr d28, [sp, #48] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #88] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8388608 // =0x800000
+; CHECK-NEON-NEXT: mul x26, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x29, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: str x10, [sp, #96] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: mul x21, x8, x11
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: fmov d8, x21
+; CHECK-NEON-NEXT: str x10, [sp, #64] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: mul x7, x8, x11
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d11, x7
+; CHECK-NEON-NEXT: str x10, [sp, #72] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mov w12, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: mul x18, x8, x11
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: fmov d31, x18
+; CHECK-NEON-NEXT: str x10, [sp, #56] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: mul x1, x8, x11
+; CHECK-NEON-NEXT: mov w11, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: dup v4.2d, x11
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: str x10, [sp, #24] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: mul x14, x8, x11
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x10, [sp, #32] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov x12, v1.d[1]
+; CHECK-NEON-NEXT: str x10, [sp, #16] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: mul x23, x9, x12
+; CHECK-NEON-NEXT: movi v4.4s, #128, lsl #24
+; CHECK-NEON-NEXT: mov x12, v3.d[1]
+; CHECK-NEON-NEXT: mul x19, x9, x10
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: fneg v4.2d, v4.2d
+; CHECK-NEON-NEXT: mul x22, x9, x12
+; CHECK-NEON-NEXT: mul x20, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: dup v2.2d, x13
+; CHECK-NEON-NEXT: fmov x13, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x16, x8, x10
+; CHECK-NEON-NEXT: mul x0, x8, x13
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: dup v3.2d, x2
+; CHECK-NEON-NEXT: mov x12, v2.d[1]
+; CHECK-NEON-NEXT: mov x2, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: dup v6.2d, x2
+; CHECK-NEON-NEXT: mul x5, x8, x13
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x4
+; CHECK-NEON-NEXT: fmov d3, x6
+; CHECK-NEON-NEXT: mov x6, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: mul x10, x9, x12
+; CHECK-NEON-NEXT: and v7.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x24
+; CHECK-NEON-NEXT: mov x12, v1.d[1]
+; CHECK-NEON-NEXT: mov x24, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: mul x4, x8, x13
+; CHECK-NEON-NEXT: mov x13, v4.d[1]
+; CHECK-NEON-NEXT: mov v2.d[1], x28
+; CHECK-NEON-NEXT: str x10, [sp, #40] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #312] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x28, x9, x12
+; CHECK-NEON-NEXT: fmov x12, d1
+; CHECK-NEON-NEXT: dup v1.2d, x3
+; CHECK-NEON-NEXT: fmov x3, d4
+; CHECK-NEON-NEXT: mov v5.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #296] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x11, x9, x13
+; CHECK-NEON-NEXT: mov x13, v7.d[1]
+; CHECK-NEON-NEXT: dup v4.2d, x6
+; CHECK-NEON-NEXT: mov v3.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #272] // 8-byte Reload
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x2, x8, x12
+; CHECK-NEON-NEXT: mov x12, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: mov v6.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #280] // 8-byte Reload
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: mov x6, v1.d[1]
+; CHECK-NEON-NEXT: eor v2.16b, v5.16b, v2.16b
+; CHECK-NEON-NEXT: mul x3, x8, x3
+; CHECK-NEON-NEXT: mov v25.d[1], x10
+; CHECK-NEON-NEXT: mul x10, x9, x13
+; CHECK-NEON-NEXT: fmov x13, d7
+; CHECK-NEON-NEXT: dup v7.2d, x24
+; CHECK-NEON-NEXT: mov x24, v4.d[1]
+; CHECK-NEON-NEXT: fmov x15, d4
+; CHECK-NEON-NEXT: fmov d4, x17
+; CHECK-NEON-NEXT: eor v5.16b, v3.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d3, x16
+; CHECK-NEON-NEXT: mov x16, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: ldr d7, [sp, #120] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v6.16b, v18.16b, v25.16b
+; CHECK-NEON-NEXT: dup v25.2d, x16
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: stp x11, x10, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #288] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x11, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: fmov x17, d16
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v3.d[1], x20
+; CHECK-NEON-NEXT: mov v19.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x24, x9, x24
+; CHECK-NEON-NEXT: mov v20.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #232] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x17, x8, x17
+; CHECK-NEON-NEXT: mov v26.d[1], x10
+; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v19.16b
+; CHECK-NEON-NEXT: mul x10, x9, x6
+; CHECK-NEON-NEXT: fmov x6, d1
+; CHECK-NEON-NEXT: dup v1.2d, x27
+; CHECK-NEON-NEXT: mov x27, v16.d[1]
+; CHECK-NEON-NEXT: dup v16.2d, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #160] // 8-byte Reload
+; CHECK-NEON-NEXT: and v24.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #128] // 8-byte Reload
+; CHECK-NEON-NEXT: and v29.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: dup v16.2d, x12
+; CHECK-NEON-NEXT: mov x12, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: str x10, [sp, #288] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #240] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v19.16b, v20.16b, v26.16b
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: mul x6, x8, x6
+; CHECK-NEON-NEXT: mov v21.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #208] // 8-byte Reload
+; CHECK-NEON-NEXT: and v12.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d16, x29
+; CHECK-NEON-NEXT: mul x27, x9, x27
+; CHECK-NEON-NEXT: mov v17.d[1], x10
+; CHECK-NEON-NEXT: mov x10, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: dup v1.2d, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #216] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v22.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #184] // 8-byte Reload
+; CHECK-NEON-NEXT: and v27.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ldr d1, [sp, #104] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v23.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #144] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v1.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d24
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v24.d[1]
+; CHECK-NEON-NEXT: fmov d24, x26
+; CHECK-NEON-NEXT: mul x30, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #136] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v22.16b, v22.16b, v23.16b
+; CHECK-NEON-NEXT: mul x25, x9, x10
+; CHECK-NEON-NEXT: mov x10, v27.d[1]
+; CHECK-NEON-NEXT: mov v9.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #112] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v7.16b, v22.16b, v7.16b
+; CHECK-NEON-NEXT: mov v10.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #88] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: eor v4.16b, v7.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d7, x13
+; CHECK-NEON-NEXT: mov v30.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d27
+; CHECK-NEON-NEXT: dup v27.2d, x12
+; CHECK-NEON-NEXT: mov x12, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: mov x13, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: eor v23.16b, v9.16b, v10.16b
+; CHECK-NEON-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEON-NEXT: mul x26, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #96] // 8-byte Reload
+; CHECK-NEON-NEXT: and v13.16b, v0.16b, v27.16b
+; CHECK-NEON-NEXT: str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v29.d[1]
+; CHECK-NEON-NEXT: dup v27.2d, x12
+; CHECK-NEON-NEXT: mov v28.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #64] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x12, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: mov v24.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d29
+; CHECK-NEON-NEXT: and v14.16b, v0.16b, v27.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov d27, x14
+; CHECK-NEON-NEXT: mov x14, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: fmov d29, x1
+; CHECK-NEON-NEXT: mul x7, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #72] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v27.d[1], x19
+; CHECK-NEON-NEXT: mov v16.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #56] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #256] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v12.d[1]
+; CHECK-NEON-NEXT: mov v8.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #24] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v11.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d12
+; CHECK-NEON-NEXT: dup v12.2d, x12
+; CHECK-NEON-NEXT: mul x18, x9, x10
+; CHECK-NEON-NEXT: mov x10, v13.d[1]
+; CHECK-NEON-NEXT: and v12.16b, v0.16b, v12.16b
+; CHECK-NEON-NEXT: mul x29, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #32] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x12, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d13
+; CHECK-NEON-NEXT: dup v13.2d, x14
+; CHECK-NEON-NEXT: mov v31.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #16] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x14, x8, x10
+; CHECK-NEON-NEXT: mov x10, v12.d[1]
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v13.16b
+; CHECK-NEON-NEXT: eor v13.16b, v2.16b, v5.16b
+; CHECK-NEON-NEXT: fmov d2, x0
+; CHECK-NEON-NEXT: mov x0, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: dup v25.2d, x0
+; CHECK-NEON-NEXT: fmov d5, x5
+; CHECK-NEON-NEXT: mov x5, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: mov x16, v18.d[1]
+; CHECK-NEON-NEXT: mov v29.d[1], x11
+; CHECK-NEON-NEXT: mov x11, v14.d[1]
+; CHECK-NEON-NEXT: mul x1, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d12
+; CHECK-NEON-NEXT: eor v26.16b, v13.16b, v6.16b
+; CHECK-NEON-NEXT: eor v6.16b, v19.16b, v21.16b
+; CHECK-NEON-NEXT: dup v19.2d, x5
+; CHECK-NEON-NEXT: mov v5.d[1], x22
+; CHECK-NEON-NEXT: mov v2.d[1], x23
+; CHECK-NEON-NEXT: ldp d13, d12, [sp, #320] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x0, x8, x10
+; CHECK-NEON-NEXT: mov x10, v20.d[1]
+; CHECK-NEON-NEXT: eor v21.16b, v6.16b, v17.16b
+; CHECK-NEON-NEXT: fmov d17, x4
+; CHECK-NEON-NEXT: fmov d6, x2
+; CHECK-NEON-NEXT: mul x16, x9, x16
+; CHECK-NEON-NEXT: mov x2, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: mov x4, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: dup v22.2d, x4
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v6.d[1], x28
+; CHECK-NEON-NEXT: mul x21, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d14
+; CHECK-NEON-NEXT: ldr d14, [sp, #304] // 8-byte Reload
+; CHECK-NEON-NEXT: str x16, [sp, #312] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x16, d18
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: dup v25.2d, x2
+; CHECK-NEON-NEXT: str x10, [sp, #296] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x10, d20
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v19.16b
+; CHECK-NEON-NEXT: mul x19, x8, x16
+; CHECK-NEON-NEXT: mov x16, v18.d[1]
+; CHECK-NEON-NEXT: eor v19.16b, v26.16b, v21.16b
+; CHECK-NEON-NEXT: eor v21.16b, v23.16b, v30.16b
+; CHECK-NEON-NEXT: and v23.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: eor v25.16b, v8.16b, v11.16b
+; CHECK-NEON-NEXT: mul x5, x8, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #40] // 8-byte Reload
+; CHECK-NEON-NEXT: dup v26.2d, x13
+; CHECK-NEON-NEXT: eor v1.16b, v19.16b, v1.16b
+; CHECK-NEON-NEXT: fmov d19, x6
+; CHECK-NEON-NEXT: mov v17.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v20.d[1]
+; CHECK-NEON-NEXT: mul x2, x9, x16
+; CHECK-NEON-NEXT: fmov x16, d18
+; CHECK-NEON-NEXT: fmov d18, x3
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v28.16b
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: ldp d9, d8, [sp, #352] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: ldp d11, d10, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x3, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d20
+; CHECK-NEON-NEXT: eor v4.16b, v21.16b, v24.16b
+; CHECK-NEON-NEXT: eor v21.16b, v25.16b, v31.16b
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v17.16b
+; CHECK-NEON-NEXT: mul x20, x8, x16
+; CHECK-NEON-NEXT: mov x16, v23.d[1]
+; CHECK-NEON-NEXT: fmov d17, x30
+; CHECK-NEON-NEXT: eor v4.16b, v4.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov x15, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #272] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v29.16b
+; CHECK-NEON-NEXT: dup v24.2d, x15
+; CHECK-NEON-NEXT: mov x4, v20.d[1]
+; CHECK-NEON-NEXT: fmov x15, d20
+; CHECK-NEON-NEXT: mov v18.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #280] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x23, x9, x16
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v27.16b
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v6.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x25
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: mov x10, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: mov x25, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: dup v22.2d, x10
+; CHECK-NEON-NEXT: fmov x10, d23
+; CHECK-NEON-NEXT: and v23.16b, v0.16b, v26.16b
+; CHECK-NEON-NEXT: eor v3.16b, v21.16b, v3.16b
+; CHECK-NEON-NEXT: mul x6, x9, x4
+; CHECK-NEON-NEXT: ldr x4, [sp, #288] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mov v16.d[1], x24
+; CHECK-NEON-NEXT: fmov d4, x17
+; CHECK-NEON-NEXT: mov x16, v23.d[1]
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEON-NEXT: and v22.16b, v0.16b, v24.16b
+; CHECK-NEON-NEXT: mov v19.d[1], x4
+; CHECK-NEON-NEXT: fmov x4, d23
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: fmov d23, x26
+; CHECK-NEON-NEXT: fmov d18, x7
+; CHECK-NEON-NEXT: mov x24, v20.d[1]
+; CHECK-NEON-NEXT: fmov d6, x29
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: mul x22, x9, x16
+; CHECK-NEON-NEXT: mov x16, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: mov x14, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: dup v21.2d, x16
+; CHECK-NEON-NEXT: mov x16, v22.d[1]
+; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v7.16b
+; CHECK-NEON-NEXT: mul x17, x8, x4
+; CHECK-NEON-NEXT: fmov x4, d20
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: dup v2.2d, x14
+; CHECK-NEON-NEXT: mov v6.d[1], x18
+; CHECK-NEON-NEXT: mov v5.d[1], x12
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: dup v21.2d, x25
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mov v4.d[1], x27
+; CHECK-NEON-NEXT: ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x7, x9, x16
+; CHECK-NEON-NEXT: mov x16, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: dup v7.2d, x16
+; CHECK-NEON-NEXT: ldr x16, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: mov x26, v20.d[1]
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v23.d[1], x16
+; CHECK-NEON-NEXT: ldr x16, [sp, #256] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x18, v21.d[1]
+; CHECK-NEON-NEXT: mul x27, x8, x4
+; CHECK-NEON-NEXT: fmov x4, d22
+; CHECK-NEON-NEXT: mov v18.d[1], x16
+; CHECK-NEON-NEXT: fmov x16, d20
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: eor v7.16b, v3.16b, v19.16b
+; CHECK-NEON-NEXT: and v19.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: fmov d2, x0
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mul x24, x9, x24
+; CHECK-NEON-NEXT: mov x12, v20.d[1]
+; CHECK-NEON-NEXT: fmov x0, d20
+; CHECK-NEON-NEXT: fmov d20, x13
+; CHECK-NEON-NEXT: mul x14, x8, x16
+; CHECK-NEON-NEXT: fmov x16, d21
+; CHECK-NEON-NEXT: eor v7.16b, v7.16b, v16.16b
+; CHECK-NEON-NEXT: eor v16.16b, v17.16b, v23.16b
+; CHECK-NEON-NEXT: fmov d17, x20
+; CHECK-NEON-NEXT: mov x20, v19.d[1]
+; CHECK-NEON-NEXT: mul x11, x9, x18
+; CHECK-NEON-NEXT: mov x18, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: mov v20.d[1], x3
+; CHECK-NEON-NEXT: dup v21.2d, x18
+; CHECK-NEON-NEXT: mov v3.d[1], x21
+; CHECK-NEON-NEXT: mov v2.d[1], x1
+; CHECK-NEON-NEXT: mul x18, x8, x16
+; CHECK-NEON-NEXT: mov v17.d[1], x2
+; CHECK-NEON-NEXT: eor v16.16b, v16.16b, v18.16b
+; CHECK-NEON-NEXT: mov x2, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: mov x3, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: eor v4.16b, v7.16b, v4.16b
+; CHECK-NEON-NEXT: mul x16, x9, x12
+; CHECK-NEON-NEXT: mov x12, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: dup v18.2d, x12
+; CHECK-NEON-NEXT: fmov x12, d19
+; CHECK-NEON-NEXT: fmov d19, x10
+; CHECK-NEON-NEXT: eor v6.16b, v16.16b, v6.16b
+; CHECK-NEON-NEXT: dup v16.2d, x2
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: fmov d20, x15
+; CHECK-NEON-NEXT: mul x13, x8, x0
+; CHECK-NEON-NEXT: mov x10, v21.d[1]
+; CHECK-NEON-NEXT: mov v19.d[1], x23
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEON-NEXT: fmov x15, d21
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d21, x17
+; CHECK-NEON-NEXT: mov v20.d[1], x6
+; CHECK-NEON-NEXT: mov x17, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v5.16b
+; CHECK-NEON-NEXT: mul x0, x9, x20
+; CHECK-NEON-NEXT: mov x1, v18.d[1]
+; CHECK-NEON-NEXT: dup v7.2d, x3
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT: mov x2, v16.d[1]
+; CHECK-NEON-NEXT: dup v19.2d, x17
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v21.d[1], x22
+; CHECK-NEON-NEXT: fmov x17, d18
+; CHECK-NEON-NEXT: fmov d18, x27
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v3.16b
+; CHECK-NEON-NEXT: and v7.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: eor v6.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: fmov d17, x12
+; CHECK-NEON-NEXT: mul x25, x8, x4
+; CHECK-NEON-NEXT: and v19.16b, v0.16b, v19.16b
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v18.d[1], x24
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x0
+; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v21.16b
+; CHECK-NEON-NEXT: movi v6.2d, #0000000000000000
+; CHECK-NEON-NEXT: mul x0, x9, x2
+; CHECK-NEON-NEXT: fmov x2, d16
+; CHECK-NEON-NEXT: fmov v16.2d, #2.00000000
+; CHECK-NEON-NEXT: fmov d20, x15
+; CHECK-NEON-NEXT: mov x12, v19.d[1]
+; CHECK-NEON-NEXT: fmov d21, x25
+; CHECK-NEON-NEXT: mul x17, x8, x17
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: fneg v6.2d, v6.2d
+; CHECK-NEON-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x15, x8, x2
+; CHECK-NEON-NEXT: fmov x2, d19
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: mov v20.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v7.d[1]
+; CHECK-NEON-NEXT: mov v21.d[1], x7
+; CHECK-NEON-NEXT: mul x1, x9, x1
+; CHECK-NEON-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: fmov d19, x17
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: mul x17, x8, x2
+; CHECK-NEON-NEXT: ldr x14, [sp, #312] // 8-byte Reload
+; CHECK-NEON-NEXT: fmov d18, x15
+; CHECK-NEON-NEXT: mov x15, v16.d[1]
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: mul x4, x9, x26
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v21.16b
+; CHECK-NEON-NEXT: mov v19.d[1], x1
+; CHECK-NEON-NEXT: fmov x1, d7
+; CHECK-NEON-NEXT: fmov d7, x19
+; CHECK-NEON-NEXT: mul x12, x9, x12
+; CHECK-NEON-NEXT: mov v18.d[1], x0
+; CHECK-NEON-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x0, x8, x1
+; CHECK-NEON-NEXT: mov v7.d[1], x14
+; CHECK-NEON-NEXT: eor v5.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT: fmov d17, x17
+; CHECK-NEON-NEXT: fmov x17, d0
+; CHECK-NEON-NEXT: mul x14, x9, x15
+; CHECK-NEON-NEXT: fmov x15, d16
+; CHECK-NEON-NEXT: mov v6.d[1], x4
+; CHECK-NEON-NEXT: fmov d16, x5
+; CHECK-NEON-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v17.d[1], x12
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: mov x12, v0.d[1]
+; CHECK-NEON-NEXT: fmov d0, x18
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v7.16b
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: fmov d7, x0
+; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x8, x8, x17
+; CHECK-NEON-NEXT: ldr x17, [sp, #296] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v0.d[1], x11
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v17.16b
+; CHECK-NEON-NEXT: mov v16.d[1], x17
+; CHECK-NEON-NEXT: mul x9, x9, x12
+; CHECK-NEON-NEXT: mov v6.d[1], x16
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEON-NEXT: fmov d4, x8
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v7.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x14
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v16.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x9
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: eor v1.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEON-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEON-NEXT: add sp, sp, #464
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmulr_v2i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev64 v1.16b, v1.16b
+; CHECK-AES-NEXT: rev64 v0.16b, v0.16b
+; CHECK-AES-NEXT: rbit v1.16b, v1.16b
+; CHECK-AES-NEXT: rbit v0.16b, v0.16b
+; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT: rev64 v0.16b, v0.16b
+; CHECK-AES-NEXT: rbit v0.16b, v0.16b
+; CHECK-AES-NEXT: ret
+ %a.ext = zext <2 x i64> %a to <2 x i128>
+ %b.ext = zext <2 x i64> %b to <2 x i128>
+ %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+ %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+ %res = trunc <2 x i128> %res.ext to <2 x i64>
+ ret <2 x i64> %res
+}
-; TODO
-;define <1 x i64> @clmulr_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind {
-; %a.ext = zext <1 x i64> %a to <1 x i128>
-; %b.ext = zext <1 x i64> %b to <1 x i128>
-; %clmul = call <1 x i128> @llvm.clmul.v2i128(<1 x i128> %a.ext, <1 x i128> %b.ext)
-; %res.ext = lshr <1 x i128> %clmul, splat (i128 63)
-; %res = trunc <1 x i128> %res.ext to <1 x i64>
-; ret <1 x i64> %res
-;}
+define <1 x i64> @clmulr_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind {
+; CHECK-NEON-LABEL: clmulr_v1i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: rev64 v1.8b, v1.8b
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: rev64 v2.8b, v0.8b
+; CHECK-NEON-NEXT: mov w10, #8 // =0x8
+; CHECK-NEON-NEXT: mov w11, #16 // =0x10
+; CHECK-NEON-NEXT: mov w12, #32 // =0x20
+; CHECK-NEON-NEXT: mov w13, #64 // =0x40
+; CHECK-NEON-NEXT: mov w14, #128 // =0x80
+; CHECK-NEON-NEXT: mov w15, #256 // =0x100
+; CHECK-NEON-NEXT: rbit v0.8b, v1.8b
+; CHECK-NEON-NEXT: fmov d1, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: rbit v2.8b, v2.8b
+; CHECK-NEON-NEXT: mov w8, #4 // =0x4
+; CHECK-NEON-NEXT: fmov d4, x8
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v1.8b
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x8, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d4, x14
+; CHECK-NEON-NEXT: mov w14, #512 // =0x200
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: fmov x12, d2
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x13
+; CHECK-NEON-NEXT: fmov x13, d1
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: fmov d3, x15
+; CHECK-NEON-NEXT: fmov x14, d2
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mov w15, #1024 // =0x400
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: mov w11, #2048 // =0x800
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: fmov x12, d1
+; CHECK-NEON-NEXT: fmov d1, x13
+; CHECK-NEON-NEXT: mov w13, #4096 // =0x1000
+; CHECK-NEON-NEXT: eor v2.8b, v4.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d4, x14
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: fmov d5, x13
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: mov w11, #8192 // =0x2000
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov w11, #16384 // =0x4000
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: eor v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mov w9, #32768 // =0x8000
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mul x9, x8, x10
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mov w10, #65536 // =0x10000
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: mov w10, #131072 // =0x20000
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov w11, #262144 // =0x40000
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x9, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: mov w11, #524288 // =0x80000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov w11, #1048576 // =0x100000
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: mov w10, #2097152 // =0x200000
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: mov w10, #4194304 // =0x400000
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov x9, d6
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x10, x8, x9
+; CHECK-NEON-NEXT: mov w9, #8388608 // =0x800000
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: mul x9, x8, x12
+; CHECK-NEON-NEXT: mov w12, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: mov w12, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: mov w12, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v3.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: fmov x15, d5
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x11, d7
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: mul x12, x8, x15
+; CHECK-NEON-NEXT: mov w15, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d6, x15
+; CHECK-NEON-NEXT: mov w15, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: fmov x14, d4
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: movi v4.2s, #128, lsl #24
+; CHECK-NEON-NEXT: mov w15, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov w15, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: fmov x14, d6
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fneg d4, d4
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: fmov x15, d6
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mul x12, x8, x15
+; CHECK-NEON-NEXT: eor v2.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x15, d16
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mul x10, x8, x15
+; CHECK-NEON-NEXT: mov x15, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: fmov x11, d5
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x15
+; CHECK-NEON-NEXT: fmov d7, x12
+; CHECK-NEON-NEXT: mov x15, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d17, x13
+; CHECK-NEON-NEXT: fmov d16, x10
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mov x13, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: mov x14, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mov x9, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: eor v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mov x11, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mov x10, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v5.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x9, d3
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mov x11, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d7, x10
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: eor v2.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: mov x14, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d6
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: mov x11, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mul x12, x8, x10
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: mul x10, x8, x9
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mov x9, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: fmov x11, d5
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mov x11, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: mov x12, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: fmov x13, d5
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d16, x9
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: fmov d3, x14
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v16.8b
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: mov x15, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d7, x15
+; CHECK-NEON-NEXT: fmov x15, d6
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: mul x9, x8, x15
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x10, d5
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d6
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: mov x13, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x15, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d7
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: fmov d17, x13
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: mul x14, x8, x10
+; CHECK-NEON-NEXT: eor v7.8b, v6.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: fmov x12, d16
+; CHECK-NEON-NEXT: fmov d16, x11
+; CHECK-NEON-NEXT: fmov d18, x15
+; CHECK-NEON-NEXT: mov x15, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: fmov x13, d17
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: mov x12, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d17, x12
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x14
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: mov x13, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: fmov x14, d16
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x11
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: mov x14, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: fmov x11, d17
+; CHECK-NEON-NEXT: fmov d17, x14
+; CHECK-NEON-NEXT: mov x14, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x14
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov x14, d16
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x12
+; CHECK-NEON-NEXT: fmov x12, d17
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: mul x15, x8, x12
+; CHECK-NEON-NEXT: mov x12, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: fmov x12, d16
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v5.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d7, x13
+; CHECK-NEON-NEXT: fmov x13, d17
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov x15, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: fmov x14, d3
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: mov x15, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: eor v6.8b, v6.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d7, x15
+; CHECK-NEON-NEXT: mov x15, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: movi d16, #0000000000000000
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d18, x13
+; CHECK-NEON-NEXT: fmov x13, d5
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fneg d16, d16
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: mov x14, #4611686018427387904 // =0x4000000000000000
+; CHECK-NEON-NEXT: fmov x15, d7
+; CHECK-NEON-NEXT: eor v6.8b, v6.8b, v18.8b
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: fmov x14, d17
+; CHECK-NEON-NEXT: fmov d17, x9
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: eor v5.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: and v0.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v17.8b
+; CHECK-NEON-NEXT: mul x9, x8, x14
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: fmov x11, d7
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: fmov d0, x15
+; CHECK-NEON-NEXT: eor v2.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x8, x8, x11
+; CHECK-NEON-NEXT: eor v0.8b, v5.8b, v0.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: rev64 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmulr_v1i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev64 v1.8b, v1.8b
+; CHECK-AES-NEXT: rev64 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v1.8b, v1.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: rev64 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
+; CHECK-AES-NEXT: ret
+ %a.ext = zext <1 x i64> %a to <1 x i128>
+ %b.ext = zext <1 x i64> %b to <1 x i128>
+ %clmul = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %a.ext, <1 x i128> %b.ext)
+ %res.ext = lshr <1 x i128> %clmul, splat (i128 63)
+ %res = trunc <1 x i128> %res.ext to <1 x i64>
+ ret <1 x i64> %res
+}
define <16 x i8> @clmulh_v16i8_neon(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: clmulh_v16i8_neon:
@@ -5716,22 +8151,1248 @@ define <2 x i32> @clmulh_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
ret <2 x i32> %res
}
-; TODO
-;define <2 x i64> @clmulh_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind {
-; %a.ext = zext <2 x i64> %a to <2 x i128>
-; %b.ext = zext <2 x i64> %b to <2 x i128>
-; %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
-; %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
-; %res = trunc <2 x i128> %res.ext to <2 x i64>
-; ret <2 x i64> %res
-;}
+define <2 x i64> @clmulh_v2i64_neon(<2 x i64> %a, <2 x i64> %b) nounwind {
+; CHECK-NEON-LABEL: clmulh_v2i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: sub sp, sp, #464
+; CHECK-NEON-NEXT: rev64 v1.16b, v1.16b
+; CHECK-NEON-NEXT: rev64 v2.16b, v0.16b
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: mov x2, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: rbit v0.16b, v1.16b
+; CHECK-NEON-NEXT: dup v1.2d, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: dup v3.2d, x8
+; CHECK-NEON-NEXT: mov w8, #4 // =0x4
+; CHECK-NEON-NEXT: rbit v2.16b, v2.16b
+; CHECK-NEON-NEXT: dup v4.2d, x8
+; CHECK-NEON-NEXT: mov w8, #8 // =0x8
+; CHECK-NEON-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: dup v5.2d, x8
+; CHECK-NEON-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: fmov x8, d2
+; CHECK-NEON-NEXT: stp d11, d10, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: and v5.16b, v0.16b, v5.16b
+; CHECK-NEON-NEXT: stp d9, d8, [sp, #352] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: mov x11, v1.d[1]
+; CHECK-NEON-NEXT: mov x13, v3.d[1]
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: stp d13, d12, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: fmov x12, d5
+; CHECK-NEON-NEXT: str d14, [sp, #304] // 8-byte Spill
+; CHECK-NEON-NEXT: mul x4, x8, x9
+; CHECK-NEON-NEXT: fmov x9, d3
+; CHECK-NEON-NEXT: mul x6, x8, x10
+; CHECK-NEON-NEXT: mov w10, #16 // =0x10
+; CHECK-NEON-NEXT: dup v1.2d, x10
+; CHECK-NEON-NEXT: mov x10, v4.d[1]
+; CHECK-NEON-NEXT: mul x3, x8, x9
+; CHECK-NEON-NEXT: mov x9, v2.d[1]
+; CHECK-NEON-NEXT: mul x24, x8, x12
+; CHECK-NEON-NEXT: mov x12, v5.d[1]
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov d5, x3
+; CHECK-NEON-NEXT: mov x3, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: mul x28, x9, x11
+; CHECK-NEON-NEXT: mov w11, #32 // =0x20
+; CHECK-NEON-NEXT: dup v2.2d, x11
+; CHECK-NEON-NEXT: mul x11, x9, x13
+; CHECK-NEON-NEXT: mov w13, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: str x10, [sp, #296] // 8-byte Spill
+; CHECK-NEON-NEXT: mov w10, #64 // =0x40
+; CHECK-NEON-NEXT: dup v3.2d, x10
+; CHECK-NEON-NEXT: mul x10, x9, x12
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mov w12, #128 // =0x80
+; CHECK-NEON-NEXT: str x11, [sp, #312] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x11, v1.d[1]
+; CHECK-NEON-NEXT: str x10, [sp, #272] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: mul x27, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #256 // =0x100
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #280] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #256] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #512 // =0x200
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #288] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #248] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #1024 // =0x400
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d25, d18, [sp, #248] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mov v18.d[1], x27
+; CHECK-NEON-NEXT: mov x27, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #224] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2048 // =0x800
+; CHECK-NEON-NEXT: ldr d19, [sp, #224] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #232] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #200] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4096 // =0x1000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #240] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #192] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8192 // =0x2000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d26, d20, [sp, #192] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #208] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #176] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16384 // =0x4000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #216] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #168] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #32768 // =0x8000
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: ldp d17, d21, [sp, #168] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: str x10, [sp, #184] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: str x11, [sp, #152] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #65536 // =0x10000
+; CHECK-NEON-NEXT: ldr d22, [sp, #152] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x15, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #131072 // =0x20000
+; CHECK-NEON-NEXT: str x10, [sp, #144] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov d23, x15
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #120] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #262144 // =0x40000
+; CHECK-NEON-NEXT: mul x17, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: str x10, [sp, #160] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #524288 // =0x80000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: str x11, [sp, #104] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: str x10, [sp, #128] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: mov w12, #1048576 // =0x100000
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #80] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: ldr d9, [sp, #80] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #136] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #2097152 // =0x200000
+; CHECK-NEON-NEXT: mul x25, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x30, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: fmov d10, x25
+; CHECK-NEON-NEXT: str x10, [sp, #112] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #4194304 // =0x400000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov d30, x30
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x11, [sp, #48] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: ldr d28, [sp, #48] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #88] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mov w12, #8388608 // =0x800000
+; CHECK-NEON-NEXT: mul x26, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mul x29, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: str x10, [sp, #96] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: mul x21, x8, x11
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: fmov d8, x21
+; CHECK-NEON-NEXT: str x10, [sp, #64] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: mul x7, x8, x11
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d2
+; CHECK-NEON-NEXT: fmov d11, x7
+; CHECK-NEON-NEXT: str x10, [sp, #72] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: mov w12, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: mul x18, x8, x11
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: fmov d31, x18
+; CHECK-NEON-NEXT: str x10, [sp, #56] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: dup v3.2d, x12
+; CHECK-NEON-NEXT: mov w12, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: mul x1, x8, x11
+; CHECK-NEON-NEXT: mov w11, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: dup v4.2d, x11
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov x11, d3
+; CHECK-NEON-NEXT: str x10, [sp, #24] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: dup v2.2d, x12
+; CHECK-NEON-NEXT: mov w12, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: mul x14, x8, x11
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: str x10, [sp, #32] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v1.d[1]
+; CHECK-NEON-NEXT: dup v1.2d, x12
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov x12, v1.d[1]
+; CHECK-NEON-NEXT: str x10, [sp, #16] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v3.d[1]
+; CHECK-NEON-NEXT: and v3.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: mul x23, x9, x12
+; CHECK-NEON-NEXT: movi v4.4s, #128, lsl #24
+; CHECK-NEON-NEXT: mov x12, v3.d[1]
+; CHECK-NEON-NEXT: mul x19, x9, x10
+; CHECK-NEON-NEXT: mov x10, v2.d[1]
+; CHECK-NEON-NEXT: fneg v4.2d, v4.2d
+; CHECK-NEON-NEXT: mul x22, x9, x12
+; CHECK-NEON-NEXT: mul x20, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: dup v2.2d, x13
+; CHECK-NEON-NEXT: fmov x13, d1
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mul x16, x8, x10
+; CHECK-NEON-NEXT: mul x0, x8, x13
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: dup v3.2d, x2
+; CHECK-NEON-NEXT: mov x12, v2.d[1]
+; CHECK-NEON-NEXT: mov x2, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: dup v6.2d, x2
+; CHECK-NEON-NEXT: mul x5, x8, x13
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v3.16b
+; CHECK-NEON-NEXT: fmov x13, d2
+; CHECK-NEON-NEXT: fmov d2, x4
+; CHECK-NEON-NEXT: fmov d3, x6
+; CHECK-NEON-NEXT: mov x6, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: mul x10, x9, x12
+; CHECK-NEON-NEXT: and v7.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x24
+; CHECK-NEON-NEXT: mov x12, v1.d[1]
+; CHECK-NEON-NEXT: mov x24, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: mul x4, x8, x13
+; CHECK-NEON-NEXT: mov x13, v4.d[1]
+; CHECK-NEON-NEXT: mov v2.d[1], x28
+; CHECK-NEON-NEXT: str x10, [sp, #40] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #312] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x28, x9, x12
+; CHECK-NEON-NEXT: fmov x12, d1
+; CHECK-NEON-NEXT: dup v1.2d, x3
+; CHECK-NEON-NEXT: fmov x3, d4
+; CHECK-NEON-NEXT: mov v5.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #296] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x11, x9, x13
+; CHECK-NEON-NEXT: mov x13, v7.d[1]
+; CHECK-NEON-NEXT: dup v4.2d, x6
+; CHECK-NEON-NEXT: mov v3.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #272] // 8-byte Reload
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mul x2, x8, x12
+; CHECK-NEON-NEXT: mov x12, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: mov v6.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #280] // 8-byte Reload
+; CHECK-NEON-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEON-NEXT: mov x6, v1.d[1]
+; CHECK-NEON-NEXT: eor v2.16b, v5.16b, v2.16b
+; CHECK-NEON-NEXT: mul x3, x8, x3
+; CHECK-NEON-NEXT: mov v25.d[1], x10
+; CHECK-NEON-NEXT: mul x10, x9, x13
+; CHECK-NEON-NEXT: fmov x13, d7
+; CHECK-NEON-NEXT: dup v7.2d, x24
+; CHECK-NEON-NEXT: mov x24, v4.d[1]
+; CHECK-NEON-NEXT: fmov x15, d4
+; CHECK-NEON-NEXT: fmov d4, x17
+; CHECK-NEON-NEXT: eor v5.16b, v3.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d3, x16
+; CHECK-NEON-NEXT: mov x16, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: ldr d7, [sp, #120] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v6.16b, v18.16b, v25.16b
+; CHECK-NEON-NEXT: dup v25.2d, x16
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: stp x11, x10, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #288] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x11, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: fmov x17, d16
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v3.d[1], x20
+; CHECK-NEON-NEXT: mov v19.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x24, x9, x24
+; CHECK-NEON-NEXT: mov v20.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #232] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x17, x8, x17
+; CHECK-NEON-NEXT: mov v26.d[1], x10
+; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v19.16b
+; CHECK-NEON-NEXT: mul x10, x9, x6
+; CHECK-NEON-NEXT: fmov x6, d1
+; CHECK-NEON-NEXT: dup v1.2d, x27
+; CHECK-NEON-NEXT: mov x27, v16.d[1]
+; CHECK-NEON-NEXT: dup v16.2d, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #160] // 8-byte Reload
+; CHECK-NEON-NEXT: and v24.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #128] // 8-byte Reload
+; CHECK-NEON-NEXT: and v29.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: dup v16.2d, x12
+; CHECK-NEON-NEXT: mov x12, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: str x10, [sp, #288] // 8-byte Spill
+; CHECK-NEON-NEXT: ldr x10, [sp, #240] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v19.16b, v20.16b, v26.16b
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: mul x6, x8, x6
+; CHECK-NEON-NEXT: mov v21.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #208] // 8-byte Reload
+; CHECK-NEON-NEXT: and v12.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d16, x29
+; CHECK-NEON-NEXT: mul x27, x9, x27
+; CHECK-NEON-NEXT: mov v17.d[1], x10
+; CHECK-NEON-NEXT: mov x10, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: dup v1.2d, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #216] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v22.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #184] // 8-byte Reload
+; CHECK-NEON-NEXT: and v27.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ldr d1, [sp, #104] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v23.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #144] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v1.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d24
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v24.d[1]
+; CHECK-NEON-NEXT: fmov d24, x26
+; CHECK-NEON-NEXT: mul x30, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #136] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v22.16b, v22.16b, v23.16b
+; CHECK-NEON-NEXT: mul x25, x9, x10
+; CHECK-NEON-NEXT: mov x10, v27.d[1]
+; CHECK-NEON-NEXT: mov v9.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #112] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v7.16b, v22.16b, v7.16b
+; CHECK-NEON-NEXT: mov v10.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #88] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: eor v4.16b, v7.16b, v4.16b
+; CHECK-NEON-NEXT: fmov d7, x13
+; CHECK-NEON-NEXT: mov v30.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d27
+; CHECK-NEON-NEXT: dup v27.2d, x12
+; CHECK-NEON-NEXT: mov x12, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: mov x13, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: eor v23.16b, v9.16b, v10.16b
+; CHECK-NEON-NEXT: eor v1.16b, v4.16b, v1.16b
+; CHECK-NEON-NEXT: mul x26, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #96] // 8-byte Reload
+; CHECK-NEON-NEXT: and v13.16b, v0.16b, v27.16b
+; CHECK-NEON-NEXT: str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v29.d[1]
+; CHECK-NEON-NEXT: dup v27.2d, x12
+; CHECK-NEON-NEXT: mov v28.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #64] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x12, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: mov v24.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d29
+; CHECK-NEON-NEXT: and v14.16b, v0.16b, v27.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: fmov d27, x14
+; CHECK-NEON-NEXT: mov x14, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: fmov d29, x1
+; CHECK-NEON-NEXT: mul x7, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #72] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v27.d[1], x19
+; CHECK-NEON-NEXT: mov v16.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #56] // 8-byte Reload
+; CHECK-NEON-NEXT: str x10, [sp, #256] // 8-byte Spill
+; CHECK-NEON-NEXT: mov x10, v12.d[1]
+; CHECK-NEON-NEXT: mov v8.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #24] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v11.d[1], x11
+; CHECK-NEON-NEXT: fmov x11, d12
+; CHECK-NEON-NEXT: dup v12.2d, x12
+; CHECK-NEON-NEXT: mul x18, x9, x10
+; CHECK-NEON-NEXT: mov x10, v13.d[1]
+; CHECK-NEON-NEXT: and v12.16b, v0.16b, v12.16b
+; CHECK-NEON-NEXT: mul x29, x8, x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #32] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x12, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d13
+; CHECK-NEON-NEXT: dup v13.2d, x14
+; CHECK-NEON-NEXT: mov v31.d[1], x11
+; CHECK-NEON-NEXT: ldr x11, [sp, #16] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x14, x8, x10
+; CHECK-NEON-NEXT: mov x10, v12.d[1]
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v13.16b
+; CHECK-NEON-NEXT: eor v13.16b, v2.16b, v5.16b
+; CHECK-NEON-NEXT: fmov d2, x0
+; CHECK-NEON-NEXT: mov x0, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: dup v25.2d, x0
+; CHECK-NEON-NEXT: fmov d5, x5
+; CHECK-NEON-NEXT: mov x5, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: mov x16, v18.d[1]
+; CHECK-NEON-NEXT: mov v29.d[1], x11
+; CHECK-NEON-NEXT: mov x11, v14.d[1]
+; CHECK-NEON-NEXT: mul x1, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d12
+; CHECK-NEON-NEXT: eor v26.16b, v13.16b, v6.16b
+; CHECK-NEON-NEXT: eor v6.16b, v19.16b, v21.16b
+; CHECK-NEON-NEXT: dup v19.2d, x5
+; CHECK-NEON-NEXT: mov v5.d[1], x22
+; CHECK-NEON-NEXT: mov v2.d[1], x23
+; CHECK-NEON-NEXT: ldp d13, d12, [sp, #320] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x0, x8, x10
+; CHECK-NEON-NEXT: mov x10, v20.d[1]
+; CHECK-NEON-NEXT: eor v21.16b, v6.16b, v17.16b
+; CHECK-NEON-NEXT: fmov d17, x4
+; CHECK-NEON-NEXT: fmov d6, x2
+; CHECK-NEON-NEXT: mul x16, x9, x16
+; CHECK-NEON-NEXT: mov x2, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: mov x4, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: dup v22.2d, x4
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v6.d[1], x28
+; CHECK-NEON-NEXT: mul x21, x9, x11
+; CHECK-NEON-NEXT: fmov x11, d14
+; CHECK-NEON-NEXT: ldr d14, [sp, #304] // 8-byte Reload
+; CHECK-NEON-NEXT: str x16, [sp, #312] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x16, d18
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: dup v25.2d, x2
+; CHECK-NEON-NEXT: str x10, [sp, #296] // 8-byte Spill
+; CHECK-NEON-NEXT: fmov x10, d20
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v19.16b
+; CHECK-NEON-NEXT: mul x19, x8, x16
+; CHECK-NEON-NEXT: mov x16, v18.d[1]
+; CHECK-NEON-NEXT: eor v19.16b, v26.16b, v21.16b
+; CHECK-NEON-NEXT: eor v21.16b, v23.16b, v30.16b
+; CHECK-NEON-NEXT: and v23.16b, v0.16b, v25.16b
+; CHECK-NEON-NEXT: eor v25.16b, v8.16b, v11.16b
+; CHECK-NEON-NEXT: mul x5, x8, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #40] // 8-byte Reload
+; CHECK-NEON-NEXT: dup v26.2d, x13
+; CHECK-NEON-NEXT: eor v1.16b, v19.16b, v1.16b
+; CHECK-NEON-NEXT: fmov d19, x6
+; CHECK-NEON-NEXT: mov v17.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v20.d[1]
+; CHECK-NEON-NEXT: mul x2, x9, x16
+; CHECK-NEON-NEXT: fmov x16, d18
+; CHECK-NEON-NEXT: fmov d18, x3
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v28.16b
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: ldp d9, d8, [sp, #352] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: ldp d11, d10, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x3, x9, x10
+; CHECK-NEON-NEXT: fmov x10, d20
+; CHECK-NEON-NEXT: eor v4.16b, v21.16b, v24.16b
+; CHECK-NEON-NEXT: eor v21.16b, v25.16b, v31.16b
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v17.16b
+; CHECK-NEON-NEXT: mul x20, x8, x16
+; CHECK-NEON-NEXT: mov x16, v23.d[1]
+; CHECK-NEON-NEXT: fmov d17, x30
+; CHECK-NEON-NEXT: eor v4.16b, v4.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov x15, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #272] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v29.16b
+; CHECK-NEON-NEXT: dup v24.2d, x15
+; CHECK-NEON-NEXT: mov x4, v20.d[1]
+; CHECK-NEON-NEXT: fmov x15, d20
+; CHECK-NEON-NEXT: mov v18.d[1], x10
+; CHECK-NEON-NEXT: ldr x10, [sp, #280] // 8-byte Reload
+; CHECK-NEON-NEXT: mul x23, x9, x16
+; CHECK-NEON-NEXT: eor v21.16b, v21.16b, v27.16b
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v6.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x25
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: mov x10, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: mov x25, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: dup v22.2d, x10
+; CHECK-NEON-NEXT: fmov x10, d23
+; CHECK-NEON-NEXT: and v23.16b, v0.16b, v26.16b
+; CHECK-NEON-NEXT: eor v3.16b, v21.16b, v3.16b
+; CHECK-NEON-NEXT: mul x6, x9, x4
+; CHECK-NEON-NEXT: ldr x4, [sp, #288] // 8-byte Reload
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mov v16.d[1], x24
+; CHECK-NEON-NEXT: fmov d4, x17
+; CHECK-NEON-NEXT: mov x16, v23.d[1]
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEON-NEXT: and v22.16b, v0.16b, v24.16b
+; CHECK-NEON-NEXT: mov v19.d[1], x4
+; CHECK-NEON-NEXT: fmov x4, d23
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: fmov d23, x26
+; CHECK-NEON-NEXT: fmov d18, x7
+; CHECK-NEON-NEXT: mov x24, v20.d[1]
+; CHECK-NEON-NEXT: fmov d6, x29
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: mul x22, x9, x16
+; CHECK-NEON-NEXT: mov x16, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: mov x14, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: dup v21.2d, x16
+; CHECK-NEON-NEXT: mov x16, v22.d[1]
+; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v7.16b
+; CHECK-NEON-NEXT: mul x17, x8, x4
+; CHECK-NEON-NEXT: fmov x4, d20
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: dup v2.2d, x14
+; CHECK-NEON-NEXT: mov v6.d[1], x18
+; CHECK-NEON-NEXT: mov v5.d[1], x12
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: dup v21.2d, x25
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: mov v4.d[1], x27
+; CHECK-NEON-NEXT: ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x7, x9, x16
+; CHECK-NEON-NEXT: mov x16, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: dup v7.2d, x16
+; CHECK-NEON-NEXT: ldr x16, [sp, #264] // 8-byte Reload
+; CHECK-NEON-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: mov x26, v20.d[1]
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v23.d[1], x16
+; CHECK-NEON-NEXT: ldr x16, [sp, #256] // 8-byte Reload
+; CHECK-NEON-NEXT: mov x18, v21.d[1]
+; CHECK-NEON-NEXT: mul x27, x8, x4
+; CHECK-NEON-NEXT: fmov x4, d22
+; CHECK-NEON-NEXT: mov v18.d[1], x16
+; CHECK-NEON-NEXT: fmov x16, d20
+; CHECK-NEON-NEXT: and v20.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: eor v7.16b, v3.16b, v19.16b
+; CHECK-NEON-NEXT: and v19.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: fmov d2, x0
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mul x24, x9, x24
+; CHECK-NEON-NEXT: mov x12, v20.d[1]
+; CHECK-NEON-NEXT: fmov x0, d20
+; CHECK-NEON-NEXT: fmov d20, x13
+; CHECK-NEON-NEXT: mul x14, x8, x16
+; CHECK-NEON-NEXT: fmov x16, d21
+; CHECK-NEON-NEXT: eor v7.16b, v7.16b, v16.16b
+; CHECK-NEON-NEXT: eor v16.16b, v17.16b, v23.16b
+; CHECK-NEON-NEXT: fmov d17, x20
+; CHECK-NEON-NEXT: mov x20, v19.d[1]
+; CHECK-NEON-NEXT: mul x11, x9, x18
+; CHECK-NEON-NEXT: mov x18, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: mov v20.d[1], x3
+; CHECK-NEON-NEXT: dup v21.2d, x18
+; CHECK-NEON-NEXT: mov v3.d[1], x21
+; CHECK-NEON-NEXT: mov v2.d[1], x1
+; CHECK-NEON-NEXT: mul x18, x8, x16
+; CHECK-NEON-NEXT: mov v17.d[1], x2
+; CHECK-NEON-NEXT: eor v16.16b, v16.16b, v18.16b
+; CHECK-NEON-NEXT: mov x2, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: mov x3, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: eor v4.16b, v7.16b, v4.16b
+; CHECK-NEON-NEXT: mul x16, x9, x12
+; CHECK-NEON-NEXT: mov x12, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEON-NEXT: dup v18.2d, x12
+; CHECK-NEON-NEXT: fmov x12, d19
+; CHECK-NEON-NEXT: fmov d19, x10
+; CHECK-NEON-NEXT: eor v6.16b, v16.16b, v6.16b
+; CHECK-NEON-NEXT: dup v16.2d, x2
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: fmov d20, x15
+; CHECK-NEON-NEXT: mul x13, x8, x0
+; CHECK-NEON-NEXT: mov x10, v21.d[1]
+; CHECK-NEON-NEXT: mov v19.d[1], x23
+; CHECK-NEON-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEON-NEXT: fmov x15, d21
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: fmov d21, x17
+; CHECK-NEON-NEXT: mov v20.d[1], x6
+; CHECK-NEON-NEXT: mov x17, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v5.16b
+; CHECK-NEON-NEXT: mul x0, x9, x20
+; CHECK-NEON-NEXT: mov x1, v18.d[1]
+; CHECK-NEON-NEXT: dup v7.2d, x3
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT: mov x2, v16.d[1]
+; CHECK-NEON-NEXT: dup v19.2d, x17
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: mov v21.d[1], x22
+; CHECK-NEON-NEXT: fmov x17, d18
+; CHECK-NEON-NEXT: fmov d18, x27
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v3.16b
+; CHECK-NEON-NEXT: and v7.16b, v0.16b, v7.16b
+; CHECK-NEON-NEXT: eor v6.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: fmov d17, x12
+; CHECK-NEON-NEXT: mul x25, x8, x4
+; CHECK-NEON-NEXT: and v19.16b, v0.16b, v19.16b
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v4.16b
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v18.d[1], x24
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x0
+; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v21.16b
+; CHECK-NEON-NEXT: movi v6.2d, #0000000000000000
+; CHECK-NEON-NEXT: mul x0, x9, x2
+; CHECK-NEON-NEXT: fmov x2, d16
+; CHECK-NEON-NEXT: fmov v16.2d, #2.00000000
+; CHECK-NEON-NEXT: fmov d20, x15
+; CHECK-NEON-NEXT: mov x12, v19.d[1]
+; CHECK-NEON-NEXT: fmov d21, x25
+; CHECK-NEON-NEXT: mul x17, x8, x17
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: fneg v6.2d, v6.2d
+; CHECK-NEON-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x15, x8, x2
+; CHECK-NEON-NEXT: fmov x2, d19
+; CHECK-NEON-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEON-NEXT: mov v20.d[1], x10
+; CHECK-NEON-NEXT: mov x10, v7.d[1]
+; CHECK-NEON-NEXT: mov v21.d[1], x7
+; CHECK-NEON-NEXT: mul x1, x9, x1
+; CHECK-NEON-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: fmov d19, x17
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: mul x17, x8, x2
+; CHECK-NEON-NEXT: ldr x14, [sp, #312] // 8-byte Reload
+; CHECK-NEON-NEXT: fmov d18, x15
+; CHECK-NEON-NEXT: mov x15, v16.d[1]
+; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v20.16b
+; CHECK-NEON-NEXT: mul x4, x9, x26
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v21.16b
+; CHECK-NEON-NEXT: mov v19.d[1], x1
+; CHECK-NEON-NEXT: fmov x1, d7
+; CHECK-NEON-NEXT: fmov d7, x19
+; CHECK-NEON-NEXT: mul x12, x9, x12
+; CHECK-NEON-NEXT: mov v18.d[1], x0
+; CHECK-NEON-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x0, x8, x1
+; CHECK-NEON-NEXT: mov v7.d[1], x14
+; CHECK-NEON-NEXT: eor v5.16b, v17.16b, v19.16b
+; CHECK-NEON-NEXT: fmov d17, x17
+; CHECK-NEON-NEXT: fmov x17, d0
+; CHECK-NEON-NEXT: mul x14, x9, x15
+; CHECK-NEON-NEXT: fmov x15, d16
+; CHECK-NEON-NEXT: mov v6.d[1], x4
+; CHECK-NEON-NEXT: fmov d16, x5
+; CHECK-NEON-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x10, x9, x10
+; CHECK-NEON-NEXT: mov v17.d[1], x12
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v18.16b
+; CHECK-NEON-NEXT: mov x12, v0.d[1]
+; CHECK-NEON-NEXT: fmov d0, x18
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v7.16b
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: fmov d7, x0
+; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; CHECK-NEON-NEXT: mul x8, x8, x17
+; CHECK-NEON-NEXT: ldr x17, [sp, #296] // 8-byte Reload
+; CHECK-NEON-NEXT: mov v0.d[1], x11
+; CHECK-NEON-NEXT: mov v7.d[1], x10
+; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v17.16b
+; CHECK-NEON-NEXT: mov v16.d[1], x17
+; CHECK-NEON-NEXT: mul x9, x9, x12
+; CHECK-NEON-NEXT: mov v6.d[1], x16
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEON-NEXT: fmov d4, x8
+; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v7.16b
+; CHECK-NEON-NEXT: mov v17.d[1], x14
+; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v16.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v6.16b
+; CHECK-NEON-NEXT: mov v4.d[1], x9
+; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v17.16b
+; CHECK-NEON-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: eor v1.16b, v2.16b, v4.16b
+; CHECK-NEON-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEON-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEON-NEXT: ushr v0.2d, v0.2d, #1
+; CHECK-NEON-NEXT: add sp, sp, #464
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmulh_v2i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev64 v1.16b, v1.16b
+; CHECK-AES-NEXT: rev64 v0.16b, v0.16b
+; CHECK-AES-NEXT: rbit v1.16b, v1.16b
+; CHECK-AES-NEXT: rbit v0.16b, v0.16b
+; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT: rev64 v0.16b, v0.16b
+; CHECK-AES-NEXT: rbit v0.16b, v0.16b
+; CHECK-AES-NEXT: ushr v0.2d, v0.2d, #1
+; CHECK-AES-NEXT: ret
+ %a.ext = zext <2 x i64> %a to <2 x i128>
+ %b.ext = zext <2 x i64> %b to <2 x i128>
+ %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+ %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
+ %res = trunc <2 x i128> %res.ext to <2 x i64>
+ ret <2 x i64> %res
+}
-; TODO
-;define <1 x i64> @clmulh_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind {
-; %a.ext = zext <1 x i64> %a to <1 x i128>
-; %b.ext = zext <1 x i64> %b to <1 x i128>
-; %clmul = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %a.ext, <1 x i128> %b.ext)
-; %res.ext = lshr <1 x i128> %clmul, splat (i128 64)
-; %res = trunc <1 x i128> %res.ext to <1 x i64>
-; ret <1 x i64> %res
-;}
+define <1 x i64> @clmulh_v1i64_neon(<1 x i64> %a, <1 x i64> %b) nounwind {
+; CHECK-NEON-LABEL: clmulh_v1i64_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: rev64 v1.8b, v1.8b
+; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: rev64 v2.8b, v0.8b
+; CHECK-NEON-NEXT: mov w10, #8 // =0x8
+; CHECK-NEON-NEXT: mov w11, #16 // =0x10
+; CHECK-NEON-NEXT: mov w12, #32 // =0x20
+; CHECK-NEON-NEXT: mov w13, #64 // =0x40
+; CHECK-NEON-NEXT: mov w14, #128 // =0x80
+; CHECK-NEON-NEXT: mov w15, #256 // =0x100
+; CHECK-NEON-NEXT: rbit v0.8b, v1.8b
+; CHECK-NEON-NEXT: fmov d1, x8
+; CHECK-NEON-NEXT: mov w8, #1 // =0x1
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: rbit v2.8b, v2.8b
+; CHECK-NEON-NEXT: mov w8, #4 // =0x4
+; CHECK-NEON-NEXT: fmov d4, x8
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v1.8b
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x8, d2
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d4, x14
+; CHECK-NEON-NEXT: mov w14, #512 // =0x200
+; CHECK-NEON-NEXT: fmov x11, d1
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: fmov x12, d2
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x13
+; CHECK-NEON-NEXT: fmov x13, d1
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: fmov d3, x15
+; CHECK-NEON-NEXT: fmov x14, d2
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mov w15, #1024 // =0x400
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: and v1.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: mov w11, #2048 // =0x800
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: fmov x12, d1
+; CHECK-NEON-NEXT: fmov d1, x13
+; CHECK-NEON-NEXT: mov w13, #4096 // =0x1000
+; CHECK-NEON-NEXT: eor v2.8b, v4.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d4, x14
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: fmov d5, x13
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v4.8b
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: mov w11, #8192 // =0x2000
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov w11, #16384 // =0x4000
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: eor v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mov w9, #32768 // =0x8000
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mul x9, x8, x10
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mov w10, #65536 // =0x10000
+; CHECK-NEON-NEXT: fmov d2, x10
+; CHECK-NEON-NEXT: mov w10, #131072 // =0x20000
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov w11, #262144 // =0x40000
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x9, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: mov w11, #524288 // =0x80000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov w11, #1048576 // =0x100000
+; CHECK-NEON-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x10, d2
+; CHECK-NEON-NEXT: fmov d2, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: mov w10, #2097152 // =0x200000
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: mov w10, #4194304 // =0x400000
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: fmov x9, d6
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x10, x8, x9
+; CHECK-NEON-NEXT: mov w9, #8388608 // =0x800000
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: mul x9, x8, x12
+; CHECK-NEON-NEXT: mov w12, #16777216 // =0x1000000
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: mov w12, #33554432 // =0x2000000
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: fmov d3, x12
+; CHECK-NEON-NEXT: mov w12, #67108864 // =0x4000000
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v3.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: fmov x15, d5
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x11, d7
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: mul x12, x8, x15
+; CHECK-NEON-NEXT: mov w15, #134217728 // =0x8000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d6, x15
+; CHECK-NEON-NEXT: mov w15, #536870912 // =0x20000000
+; CHECK-NEON-NEXT: fmov x14, d4
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: movi v4.2s, #128, lsl #24
+; CHECK-NEON-NEXT: mov w15, #1073741824 // =0x40000000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov w15, #268435456 // =0x10000000
+; CHECK-NEON-NEXT: fmov x14, d6
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fneg d4, d4
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: fmov x15, d6
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mul x12, x8, x15
+; CHECK-NEON-NEXT: eor v2.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x15, d16
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mul x10, x8, x15
+; CHECK-NEON-NEXT: mov x15, #4294967296 // =0x100000000
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: fmov x11, d5
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x15
+; CHECK-NEON-NEXT: fmov d7, x12
+; CHECK-NEON-NEXT: mov x15, #281474976710656 // =0x1000000000000
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #8589934592 // =0x200000000
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d17, x13
+; CHECK-NEON-NEXT: fmov d16, x10
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v5.8b
+; CHECK-NEON-NEXT: mov x13, #549755813888 // =0x8000000000
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: fmov x10, d4
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: mov x14, #17592186044416 // =0x100000000000
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v3.8b
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mov x9, #17179869184 // =0x400000000
+; CHECK-NEON-NEXT: mul x10, x8, x10
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: eor v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: mov x11, #34359738368 // =0x800000000
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: mov x10, #137438953472 // =0x2000000000
+; CHECK-NEON-NEXT: fmov x11, d4
+; CHECK-NEON-NEXT: fmov d4, x10
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v5.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: fmov d6, x9
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, #274877906944 // =0x4000000000
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x9, d3
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mov x11, #68719476736 // =0x1000000000
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #1099511627776 // =0x10000000000
+; CHECK-NEON-NEXT: fmov x12, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mul x9, x8, x9
+; CHECK-NEON-NEXT: fmov d7, x10
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: and v4.8b, v0.8b, v4.8b
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: eor v2.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: mov x14, #35184372088832 // =0x200000000000
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: fmov x10, d3
+; CHECK-NEON-NEXT: fmov d3, x9
+; CHECK-NEON-NEXT: fmov x9, d4
+; CHECK-NEON-NEXT: fmov d4, x11
+; CHECK-NEON-NEXT: mov x11, #2199023255552 // =0x20000000000
+; CHECK-NEON-NEXT: mul x13, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d6
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: mov x11, #4398046511104 // =0x40000000000
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: mul x12, x8, x10
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v6.8b
+; CHECK-NEON-NEXT: mul x10, x8, x9
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: mov x9, #8796093022208 // =0x80000000000
+; CHECK-NEON-NEXT: fmov x11, d5
+; CHECK-NEON-NEXT: fmov d5, x9
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: fmov x12, d3
+; CHECK-NEON-NEXT: mul x9, x8, x11
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mov x11, #70368744177664 // =0x400000000000
+; CHECK-NEON-NEXT: fmov d3, x11
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: mov x12, #140737488355328 // =0x800000000000
+; CHECK-NEON-NEXT: fmov x13, d5
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d16, x9
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: fmov x13, d3
+; CHECK-NEON-NEXT: fmov d3, x14
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v16.8b
+; CHECK-NEON-NEXT: fmov x14, d5
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: mov x15, #562949953421312 // =0x2000000000000
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d7, x15
+; CHECK-NEON-NEXT: fmov x15, d6
+; CHECK-NEON-NEXT: and v6.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: fmov d3, x10
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: mul x9, x8, x15
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT: fmov x10, d5
+; CHECK-NEON-NEXT: fmov d5, x11
+; CHECK-NEON-NEXT: fmov x11, d6
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: mov x13, #1125899906842624 // =0x4000000000000
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: mov x13, #2251799813685248 // =0x8000000000000
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: mul x15, x8, x10
+; CHECK-NEON-NEXT: fmov x10, d7
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: fmov d17, x13
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: mul x14, x8, x10
+; CHECK-NEON-NEXT: eor v7.8b, v6.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d6, x12
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: mov x11, #4503599627370496 // =0x10000000000000
+; CHECK-NEON-NEXT: fmov x12, d16
+; CHECK-NEON-NEXT: fmov d16, x11
+; CHECK-NEON-NEXT: fmov d18, x15
+; CHECK-NEON-NEXT: mov x15, #288230376151711744 // =0x400000000000000
+; CHECK-NEON-NEXT: fmov x13, d17
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: mul x11, x8, x12
+; CHECK-NEON-NEXT: mov x12, #9007199254740992 // =0x20000000000000
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d17, x12
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x14
+; CHECK-NEON-NEXT: mul x12, x8, x13
+; CHECK-NEON-NEXT: mov x13, #72057594037927936 // =0x100000000000000
+; CHECK-NEON-NEXT: fmov x14, d16
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov d16, x13
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x11
+; CHECK-NEON-NEXT: mul x13, x8, x14
+; CHECK-NEON-NEXT: mov x14, #144115188075855872 // =0x200000000000000
+; CHECK-NEON-NEXT: fmov x11, d17
+; CHECK-NEON-NEXT: fmov d17, x14
+; CHECK-NEON-NEXT: mov x14, #18014398509481984 // =0x40000000000000
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x14
+; CHECK-NEON-NEXT: mul x11, x8, x11
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov x14, d16
+; CHECK-NEON-NEXT: and v16.8b, v0.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d18, x12
+; CHECK-NEON-NEXT: fmov x12, d17
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: mul x15, x8, x12
+; CHECK-NEON-NEXT: mov x12, #576460752303423488 // =0x800000000000000
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fmov d5, x12
+; CHECK-NEON-NEXT: fmov x12, d16
+; CHECK-NEON-NEXT: fmov d6, x14
+; CHECK-NEON-NEXT: and v3.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v5.8b, v7.8b, v18.8b
+; CHECK-NEON-NEXT: fmov d7, x13
+; CHECK-NEON-NEXT: fmov x13, d17
+; CHECK-NEON-NEXT: fmov d16, x15
+; CHECK-NEON-NEXT: mov x15, #1152921504606846976 // =0x1000000000000000
+; CHECK-NEON-NEXT: mul x12, x8, x12
+; CHECK-NEON-NEXT: fmov x14, d3
+; CHECK-NEON-NEXT: eor v3.8b, v5.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d5, x15
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: mov x15, #2305843009213693952 // =0x2000000000000000
+; CHECK-NEON-NEXT: eor v6.8b, v6.8b, v16.8b
+; CHECK-NEON-NEXT: fmov d7, x15
+; CHECK-NEON-NEXT: mov x15, #36028797018963968 // =0x80000000000000
+; CHECK-NEON-NEXT: movi d16, #0000000000000000
+; CHECK-NEON-NEXT: mul x14, x8, x14
+; CHECK-NEON-NEXT: and v5.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d17, x15
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: fmov d18, x13
+; CHECK-NEON-NEXT: fmov x13, d5
+; CHECK-NEON-NEXT: and v17.8b, v0.8b, v17.8b
+; CHECK-NEON-NEXT: fneg d16, d16
+; CHECK-NEON-NEXT: fmov d5, x14
+; CHECK-NEON-NEXT: mov x14, #4611686018427387904 // =0x4000000000000000
+; CHECK-NEON-NEXT: fmov x15, d7
+; CHECK-NEON-NEXT: eor v6.8b, v6.8b, v18.8b
+; CHECK-NEON-NEXT: mul x13, x8, x13
+; CHECK-NEON-NEXT: fmov d7, x14
+; CHECK-NEON-NEXT: fmov x14, d17
+; CHECK-NEON-NEXT: fmov d17, x9
+; CHECK-NEON-NEXT: mul x15, x8, x15
+; CHECK-NEON-NEXT: eor v5.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: fmov d6, x11
+; CHECK-NEON-NEXT: and v7.8b, v0.8b, v7.8b
+; CHECK-NEON-NEXT: and v0.8b, v0.8b, v16.8b
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v17.8b
+; CHECK-NEON-NEXT: mul x9, x8, x14
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x13
+; CHECK-NEON-NEXT: fmov x11, d7
+; CHECK-NEON-NEXT: eor v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d6, x10
+; CHECK-NEON-NEXT: mul x10, x8, x11
+; CHECK-NEON-NEXT: fmov x11, d0
+; CHECK-NEON-NEXT: fmov d0, x15
+; CHECK-NEON-NEXT: eor v2.8b, v4.8b, v6.8b
+; CHECK-NEON-NEXT: fmov d4, x12
+; CHECK-NEON-NEXT: mul x8, x8, x11
+; CHECK-NEON-NEXT: eor v0.8b, v5.8b, v0.8b
+; CHECK-NEON-NEXT: fmov d5, x10
+; CHECK-NEON-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d4, x9
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v5.8b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v4.8b
+; CHECK-NEON-NEXT: fmov d3, x8
+; CHECK-NEON-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: rev64 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEON-NEXT: ushr d0, d0, #1
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmulh_v1i64_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev64 v1.8b, v1.8b
+; CHECK-AES-NEXT: rev64 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v1.8b, v1.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: rev64 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
+; CHECK-AES-NEXT: ushr d0, d0, #1
+; CHECK-AES-NEXT: ret
+ %a.ext = zext <1 x i64> %a to <1 x i128>
+ %b.ext = zext <1 x i64> %b to <1 x i128>
+ %clmul = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %a.ext, <1 x i128> %b.ext)
+ %res.ext = lshr <1 x i128> %clmul, splat (i128 64)
+ %res = trunc <1 x i128> %res.ext to <1 x i64>
+ ret <1 x i64> %res
+}
More information about the llvm-commits
mailing list