[llvm] Add clmul zext AArch64 lowering tests (PR #179641)

Matthew Devereau via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 4 07:09:30 PST 2026


https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/179641

>From a0269637aef5fa53cc2807fd9ef84469d4f1dd11 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 4 Feb 2026 11:05:48 +0000
Subject: [PATCH] Add clmul zext AArch64 lowering tests

---
 llvm/test/CodeGen/AArch64/clmul-fixed.ll    | 9361 ++++++++++++++++++-
 llvm/test/CodeGen/AArch64/clmul-scalable.ll | 3354 ++++---
 llvm/test/CodeGen/AArch64/clmul.ll          |  756 ++
 3 files changed, 11917 insertions(+), 1554 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index ff5eeb6cdb43d..fc1935b99de3f 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -1,432 +1,839 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - | FileCheck %s --check-prefix=NEON
+; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - -mattr=+aes | FileCheck %s --check-prefix=NEON-AES
 
 define <16 x i8> @clmul_v16i8_neon(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: clmul_v16i8_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    pmul v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v16i8_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    pmul v0.16b, v0.16b, v1.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v16i8_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    pmul v0.16b, v0.16b, v1.16b
+; NEON-AES-NEXT:    ret
   %a = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %a
 }
 
 define <8 x i8> @clmul_v8i8_neon(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: clmul_v8i8_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    pmul v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v8i8_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v8i8_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; NEON-AES-NEXT:    ret
   %a = call <8 x i8> @llvm.clmul.v8i8(<8 x i8> %x, <8 x i8> %y)
   ret <8 x i8> %a
 }
 
 define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: clmul_v8i16_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v3.8h, #1
-; CHECK-NEXT:    movi v4.8h, #4
-; CHECK-NEXT:    movi v5.8h, #8
-; CHECK-NEXT:    movi v6.8h, #16
-; CHECK-NEXT:    movi v7.8h, #32
-; CHECK-NEXT:    movi v16.8h, #128
-; CHECK-NEXT:    movi v17.8h, #1, lsl #8
-; CHECK-NEXT:    movi v18.8h, #8, lsl #8
-; CHECK-NEXT:    movi v19.8h, #16, lsl #8
-; CHECK-NEXT:    movi v20.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    mul v3.8h, v0.8h, v3.8h
-; CHECK-NEXT:    mul v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v0.8h, v5.8h
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    mul v6.8h, v0.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v0.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v17.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    mul v18.8h, v0.8h, v18.8h
-; CHECK-NEXT:    mul v19.8h, v0.8h, v19.8h
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v4.8h, v0.8h, v20.8h
-; CHECK-NEXT:    movi v20.8h, #128, lsl #8
-; CHECK-NEXT:    mul v5.8h, v0.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v0.8h, v22.8h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v20.16b
-; CHECK-NEXT:    mul v3.8h, v0.8h, v21.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v23.8h
-; CHECK-NEXT:    eor v4.16b, v6.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v8i16_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.8h, #2
+; NEON-NEXT:    movi v3.8h, #1
+; NEON-NEXT:    movi v4.8h, #4
+; NEON-NEXT:    movi v5.8h, #8
+; NEON-NEXT:    movi v6.8h, #16
+; NEON-NEXT:    movi v7.8h, #32
+; NEON-NEXT:    movi v16.8h, #128
+; NEON-NEXT:    movi v17.8h, #1, lsl #8
+; NEON-NEXT:    movi v18.8h, #8, lsl #8
+; NEON-NEXT:    movi v19.8h, #16, lsl #8
+; NEON-NEXT:    movi v20.8h, #64
+; NEON-NEXT:    movi v21.8h, #2, lsl #8
+; NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    movi v22.8h, #32, lsl #8
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-NEXT:    mul v2.8h, v0.8h, v2.8h
+; NEON-NEXT:    mul v3.8h, v0.8h, v3.8h
+; NEON-NEXT:    mul v4.8h, v0.8h, v4.8h
+; NEON-NEXT:    mul v5.8h, v0.8h, v5.8h
+; NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-NEXT:    movi v23.8h, #4, lsl #8
+; NEON-NEXT:    movi v24.8h, #64, lsl #8
+; NEON-NEXT:    mul v6.8h, v0.8h, v6.8h
+; NEON-NEXT:    mul v7.8h, v0.8h, v7.8h
+; NEON-NEXT:    mul v16.8h, v0.8h, v16.8h
+; NEON-NEXT:    mul v17.8h, v0.8h, v17.8h
+; NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-NEXT:    mul v18.8h, v0.8h, v18.8h
+; NEON-NEXT:    mul v19.8h, v0.8h, v19.8h
+; NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-NEXT:    mul v4.8h, v0.8h, v20.8h
+; NEON-NEXT:    movi v20.8h, #128, lsl #8
+; NEON-NEXT:    mul v5.8h, v0.8h, v21.8h
+; NEON-NEXT:    and v21.16b, v1.16b, v23.16b
+; NEON-NEXT:    and v23.16b, v1.16b, v24.16b
+; NEON-NEXT:    mul v22.8h, v0.8h, v22.8h
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    eor v7.16b, v16.16b, v17.16b
+; NEON-NEXT:    eor v16.16b, v18.16b, v19.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    and v1.16b, v1.16b, v20.16b
+; NEON-NEXT:    mul v3.8h, v0.8h, v21.8h
+; NEON-NEXT:    mul v17.8h, v0.8h, v23.8h
+; NEON-NEXT:    eor v4.16b, v6.16b, v4.16b
+; NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
+; NEON-NEXT:    eor v6.16b, v16.16b, v22.16b
+; NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
+; NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-NEXT:    eor v2.16b, v5.16b, v3.16b
+; NEON-NEXT:    eor v3.16b, v6.16b, v17.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v8i16_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.8h, #2
+; NEON-AES-NEXT:    movi v3.8h, #1
+; NEON-AES-NEXT:    movi v4.8h, #4
+; NEON-AES-NEXT:    movi v5.8h, #8
+; NEON-AES-NEXT:    movi v6.8h, #16
+; NEON-AES-NEXT:    movi v7.8h, #32
+; NEON-AES-NEXT:    movi v16.8h, #128
+; NEON-AES-NEXT:    movi v17.8h, #1, lsl #8
+; NEON-AES-NEXT:    movi v18.8h, #8, lsl #8
+; NEON-AES-NEXT:    movi v19.8h, #16, lsl #8
+; NEON-AES-NEXT:    movi v20.8h, #64
+; NEON-AES-NEXT:    movi v21.8h, #2, lsl #8
+; NEON-AES-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    movi v22.8h, #32, lsl #8
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    mul v2.8h, v0.8h, v2.8h
+; NEON-AES-NEXT:    mul v3.8h, v0.8h, v3.8h
+; NEON-AES-NEXT:    mul v4.8h, v0.8h, v4.8h
+; NEON-AES-NEXT:    mul v5.8h, v0.8h, v5.8h
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    movi v23.8h, #4, lsl #8
+; NEON-AES-NEXT:    movi v24.8h, #64, lsl #8
+; NEON-AES-NEXT:    mul v6.8h, v0.8h, v6.8h
+; NEON-AES-NEXT:    mul v7.8h, v0.8h, v7.8h
+; NEON-AES-NEXT:    mul v16.8h, v0.8h, v16.8h
+; NEON-AES-NEXT:    mul v17.8h, v0.8h, v17.8h
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    mul v18.8h, v0.8h, v18.8h
+; NEON-AES-NEXT:    mul v19.8h, v0.8h, v19.8h
+; NEON-AES-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    mul v4.8h, v0.8h, v20.8h
+; NEON-AES-NEXT:    movi v20.8h, #128, lsl #8
+; NEON-AES-NEXT:    mul v5.8h, v0.8h, v21.8h
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v23.16b
+; NEON-AES-NEXT:    and v23.16b, v1.16b, v24.16b
+; NEON-AES-NEXT:    mul v22.8h, v0.8h, v22.8h
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    eor v7.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    eor v16.16b, v18.16b, v19.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    mul v3.8h, v0.8h, v21.8h
+; NEON-AES-NEXT:    mul v17.8h, v0.8h, v23.8h
+; NEON-AES-NEXT:    eor v4.16b, v6.16b, v4.16b
+; NEON-AES-NEXT:    eor v5.16b, v7.16b, v5.16b
+; NEON-AES-NEXT:    eor v6.16b, v16.16b, v22.16b
+; NEON-AES-NEXT:    mul v0.8h, v0.8h, v1.8h
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    eor v2.16b, v5.16b, v3.16b
+; NEON-AES-NEXT:    eor v3.16b, v6.16b, v17.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    ret
   %a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
 }
 
 define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: clmul_v4i16_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4h, #2
-; CHECK-NEXT:    movi v3.4h, #1
-; CHECK-NEXT:    movi v4.4h, #4
-; CHECK-NEXT:    movi v5.4h, #8
-; CHECK-NEXT:    movi v6.4h, #16
-; CHECK-NEXT:    movi v7.4h, #32
-; CHECK-NEXT:    movi v16.4h, #128
-; CHECK-NEXT:    movi v17.4h, #1, lsl #8
-; CHECK-NEXT:    movi v18.4h, #8, lsl #8
-; CHECK-NEXT:    movi v19.4h, #16, lsl #8
-; CHECK-NEXT:    movi v20.4h, #64
-; CHECK-NEXT:    movi v21.4h, #2, lsl #8
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    movi v22.4h, #32, lsl #8
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v2.4h, v0.4h, v2.4h
-; CHECK-NEXT:    mul v3.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mul v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    mul v5.4h, v0.4h, v5.4h
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v23.4h, #4, lsl #8
-; CHECK-NEXT:    movi v24.4h, #64, lsl #8
-; CHECK-NEXT:    mul v6.4h, v0.4h, v6.4h
-; CHECK-NEXT:    mul v7.4h, v0.4h, v7.4h
-; CHECK-NEXT:    mul v16.4h, v0.4h, v16.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v17.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    mul v18.4h, v0.4h, v18.4h
-; CHECK-NEXT:    mul v19.4h, v0.4h, v19.4h
-; CHECK-NEXT:    and v22.8b, v1.8b, v22.8b
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v4.4h, v0.4h, v20.4h
-; CHECK-NEXT:    movi v20.4h, #128, lsl #8
-; CHECK-NEXT:    mul v5.4h, v0.4h, v21.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v24.8b
-; CHECK-NEXT:    mul v22.4h, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    eor v7.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v16.8b, v18.8b, v19.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v20.8b
-; CHECK-NEXT:    mul v3.4h, v0.4h, v21.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v23.4h
-; CHECK-NEXT:    eor v4.8b, v6.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v7.8b, v5.8b
-; CHECK-NEXT:    eor v6.8b, v16.8b, v22.8b
-; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.8b, v2.8b, v4.8b
-; CHECK-NEXT:    eor v2.8b, v5.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v4i16_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.4h, #2
+; NEON-NEXT:    movi v3.4h, #1
+; NEON-NEXT:    movi v4.4h, #4
+; NEON-NEXT:    movi v5.4h, #8
+; NEON-NEXT:    movi v6.4h, #16
+; NEON-NEXT:    movi v7.4h, #32
+; NEON-NEXT:    movi v16.4h, #128
+; NEON-NEXT:    movi v17.4h, #1, lsl #8
+; NEON-NEXT:    movi v18.4h, #8, lsl #8
+; NEON-NEXT:    movi v19.4h, #16, lsl #8
+; NEON-NEXT:    movi v20.4h, #64
+; NEON-NEXT:    movi v21.4h, #2, lsl #8
+; NEON-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-NEXT:    and v5.8b, v1.8b, v5.8b
+; NEON-NEXT:    movi v22.4h, #32, lsl #8
+; NEON-NEXT:    and v6.8b, v1.8b, v6.8b
+; NEON-NEXT:    and v7.8b, v1.8b, v7.8b
+; NEON-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-NEXT:    and v18.8b, v1.8b, v18.8b
+; NEON-NEXT:    and v19.8b, v1.8b, v19.8b
+; NEON-NEXT:    mul v2.4h, v0.4h, v2.4h
+; NEON-NEXT:    mul v3.4h, v0.4h, v3.4h
+; NEON-NEXT:    mul v4.4h, v0.4h, v4.4h
+; NEON-NEXT:    mul v5.4h, v0.4h, v5.4h
+; NEON-NEXT:    and v20.8b, v1.8b, v20.8b
+; NEON-NEXT:    movi v23.4h, #4, lsl #8
+; NEON-NEXT:    movi v24.4h, #64, lsl #8
+; NEON-NEXT:    mul v6.4h, v0.4h, v6.4h
+; NEON-NEXT:    mul v7.4h, v0.4h, v7.4h
+; NEON-NEXT:    mul v16.4h, v0.4h, v16.4h
+; NEON-NEXT:    mul v17.4h, v0.4h, v17.4h
+; NEON-NEXT:    and v21.8b, v1.8b, v21.8b
+; NEON-NEXT:    mul v18.4h, v0.4h, v18.4h
+; NEON-NEXT:    mul v19.4h, v0.4h, v19.4h
+; NEON-NEXT:    and v22.8b, v1.8b, v22.8b
+; NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; NEON-NEXT:    eor v3.8b, v4.8b, v5.8b
+; NEON-NEXT:    mul v4.4h, v0.4h, v20.4h
+; NEON-NEXT:    movi v20.4h, #128, lsl #8
+; NEON-NEXT:    mul v5.4h, v0.4h, v21.4h
+; NEON-NEXT:    and v21.8b, v1.8b, v23.8b
+; NEON-NEXT:    and v23.8b, v1.8b, v24.8b
+; NEON-NEXT:    mul v22.4h, v0.4h, v22.4h
+; NEON-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-NEXT:    eor v7.8b, v16.8b, v17.8b
+; NEON-NEXT:    eor v16.8b, v18.8b, v19.8b
+; NEON-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-NEXT:    and v1.8b, v1.8b, v20.8b
+; NEON-NEXT:    mul v3.4h, v0.4h, v21.4h
+; NEON-NEXT:    mul v17.4h, v0.4h, v23.4h
+; NEON-NEXT:    eor v4.8b, v6.8b, v4.8b
+; NEON-NEXT:    eor v5.8b, v7.8b, v5.8b
+; NEON-NEXT:    eor v6.8b, v16.8b, v22.8b
+; NEON-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NEON-NEXT:    eor v1.8b, v2.8b, v4.8b
+; NEON-NEXT:    eor v2.8b, v5.8b, v3.8b
+; NEON-NEXT:    eor v3.8b, v6.8b, v17.8b
+; NEON-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; NEON-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v4i16_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.4h, #2
+; NEON-AES-NEXT:    movi v3.4h, #1
+; NEON-AES-NEXT:    movi v4.4h, #4
+; NEON-AES-NEXT:    movi v5.4h, #8
+; NEON-AES-NEXT:    movi v6.4h, #16
+; NEON-AES-NEXT:    movi v7.4h, #32
+; NEON-AES-NEXT:    movi v16.4h, #128
+; NEON-AES-NEXT:    movi v17.4h, #1, lsl #8
+; NEON-AES-NEXT:    movi v18.4h, #8, lsl #8
+; NEON-AES-NEXT:    movi v19.4h, #16, lsl #8
+; NEON-AES-NEXT:    movi v20.4h, #64
+; NEON-AES-NEXT:    movi v21.4h, #2, lsl #8
+; NEON-AES-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-AES-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-AES-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-AES-NEXT:    and v5.8b, v1.8b, v5.8b
+; NEON-AES-NEXT:    movi v22.4h, #32, lsl #8
+; NEON-AES-NEXT:    and v6.8b, v1.8b, v6.8b
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v7.8b
+; NEON-AES-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-AES-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-AES-NEXT:    and v18.8b, v1.8b, v18.8b
+; NEON-AES-NEXT:    and v19.8b, v1.8b, v19.8b
+; NEON-AES-NEXT:    mul v2.4h, v0.4h, v2.4h
+; NEON-AES-NEXT:    mul v3.4h, v0.4h, v3.4h
+; NEON-AES-NEXT:    mul v4.4h, v0.4h, v4.4h
+; NEON-AES-NEXT:    mul v5.4h, v0.4h, v5.4h
+; NEON-AES-NEXT:    and v20.8b, v1.8b, v20.8b
+; NEON-AES-NEXT:    movi v23.4h, #4, lsl #8
+; NEON-AES-NEXT:    movi v24.4h, #64, lsl #8
+; NEON-AES-NEXT:    mul v6.4h, v0.4h, v6.4h
+; NEON-AES-NEXT:    mul v7.4h, v0.4h, v7.4h
+; NEON-AES-NEXT:    mul v16.4h, v0.4h, v16.4h
+; NEON-AES-NEXT:    mul v17.4h, v0.4h, v17.4h
+; NEON-AES-NEXT:    and v21.8b, v1.8b, v21.8b
+; NEON-AES-NEXT:    mul v18.4h, v0.4h, v18.4h
+; NEON-AES-NEXT:    mul v19.4h, v0.4h, v19.4h
+; NEON-AES-NEXT:    and v22.8b, v1.8b, v22.8b
+; NEON-AES-NEXT:    eor v2.8b, v3.8b, v2.8b
+; NEON-AES-NEXT:    eor v3.8b, v4.8b, v5.8b
+; NEON-AES-NEXT:    mul v4.4h, v0.4h, v20.4h
+; NEON-AES-NEXT:    movi v20.4h, #128, lsl #8
+; NEON-AES-NEXT:    mul v5.4h, v0.4h, v21.4h
+; NEON-AES-NEXT:    and v21.8b, v1.8b, v23.8b
+; NEON-AES-NEXT:    and v23.8b, v1.8b, v24.8b
+; NEON-AES-NEXT:    mul v22.4h, v0.4h, v22.4h
+; NEON-AES-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-AES-NEXT:    eor v7.8b, v16.8b, v17.8b
+; NEON-AES-NEXT:    eor v16.8b, v18.8b, v19.8b
+; NEON-AES-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-AES-NEXT:    and v1.8b, v1.8b, v20.8b
+; NEON-AES-NEXT:    mul v3.4h, v0.4h, v21.4h
+; NEON-AES-NEXT:    mul v17.4h, v0.4h, v23.4h
+; NEON-AES-NEXT:    eor v4.8b, v6.8b, v4.8b
+; NEON-AES-NEXT:    eor v5.8b, v7.8b, v5.8b
+; NEON-AES-NEXT:    eor v6.8b, v16.8b, v22.8b
+; NEON-AES-NEXT:    mul v0.4h, v0.4h, v1.4h
+; NEON-AES-NEXT:    eor v1.8b, v2.8b, v4.8b
+; NEON-AES-NEXT:    eor v2.8b, v5.8b, v3.8b
+; NEON-AES-NEXT:    eor v3.8b, v6.8b, v17.8b
+; NEON-AES-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NEON-AES-NEXT:    eor v0.8b, v3.8b, v0.8b
+; NEON-AES-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NEON-AES-NEXT:    ret
   %a = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %x, <4 x i16> %y)
   ret <4 x i16> %a
 }
 
 define <4 x i32> @clmul_v4i32_neon(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: clmul_v4i32_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #64
-; CHECK-NEXT:    movi v17.4s, #128
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    movi v21.4s, #128, lsl #16
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #16, lsl #8
-; CHECK-NEXT:    mul v5.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v17.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v18.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #32, lsl #8
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    movi v20.4s, #64, lsl #8
-; CHECK-NEXT:    mul v21.4s, v0.4s, v21.4s
-; CHECK-NEXT:    and v3.16b, v1.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #1, lsl #16
-; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT:    movi v16.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v17.4s, #4, lsl #8
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v16.16b
-; CHECK-NEXT:    movi v16.4s, #64, lsl #16
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v19.4s
-; CHECK-NEXT:    movi v19.4s, #4, lsl #16
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v18.16b, v3.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v20.4s, #1, lsl #24
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #8
-; CHECK-NEXT:    mul v16.4s, v0.4s, v16.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v20.4s, v0.4s, v20.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v18.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v23.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v25.16b
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v24.16b
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v16.16b, v16.16b, v20.16b
-; CHECK-NEXT:    and v20.16b, v1.16b, v26.16b
-; CHECK-NEXT:    mul v19.4s, v0.4s, v19.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v0.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v0.4s, v20.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v22.16b
-; CHECK-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v4i32_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.4s, #2
+; NEON-NEXT:    movi v3.4s, #1
+; NEON-NEXT:    movi v4.4s, #4
+; NEON-NEXT:    movi v5.4s, #8
+; NEON-NEXT:    movi v6.4s, #16
+; NEON-NEXT:    movi v7.4s, #32
+; NEON-NEXT:    movi v16.4s, #64
+; NEON-NEXT:    movi v17.4s, #128
+; NEON-NEXT:    movi v18.4s, #1, lsl #8
+; NEON-NEXT:    movi v19.4s, #2, lsl #8
+; NEON-NEXT:    movi v20.4s, #8, lsl #8
+; NEON-NEXT:    movi v21.4s, #128, lsl #16
+; NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-NEXT:    mul v2.4s, v0.4s, v2.4s
+; NEON-NEXT:    mul v3.4s, v0.4s, v3.4s
+; NEON-NEXT:    mul v4.4s, v0.4s, v4.4s
+; NEON-NEXT:    mul v5.4s, v0.4s, v5.4s
+; NEON-NEXT:    mul v6.4s, v0.4s, v6.4s
+; NEON-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-NEXT:    movi v22.4s, #8, lsl #16
+; NEON-NEXT:    movi v23.4s, #2, lsl #24
+; NEON-NEXT:    movi v25.4s, #4, lsl #24
+; NEON-NEXT:    movi v24.4s, #32, lsl #16
+; NEON-NEXT:    movi v26.4s, #8, lsl #24
+; NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-NEXT:    movi v4.4s, #16, lsl #8
+; NEON-NEXT:    mul v5.4s, v0.4s, v16.4s
+; NEON-NEXT:    mul v16.4s, v0.4s, v17.4s
+; NEON-NEXT:    mul v17.4s, v0.4s, v18.4s
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v19.16b
+; NEON-NEXT:    movi v19.4s, #32, lsl #8
+; NEON-NEXT:    and v18.16b, v1.16b, v20.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    movi v20.4s, #64, lsl #8
+; NEON-NEXT:    mul v21.4s, v0.4s, v21.4s
+; NEON-NEXT:    and v3.16b, v1.16b, v4.16b
+; NEON-NEXT:    eor v5.16b, v6.16b, v5.16b
+; NEON-NEXT:    movi v4.4s, #1, lsl #16
+; NEON-NEXT:    eor v6.16b, v16.16b, v17.16b
+; NEON-NEXT:    movi v16.4s, #2, lsl #16
+; NEON-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-NEXT:    mul v18.4s, v0.4s, v18.4s
+; NEON-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-NEXT:    movi v17.4s, #4, lsl #8
+; NEON-NEXT:    mul v3.4s, v0.4s, v3.4s
+; NEON-NEXT:    eor v2.16b, v2.16b, v5.16b
+; NEON-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v16.16b
+; NEON-NEXT:    movi v16.4s, #64, lsl #16
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    mul v7.4s, v0.4s, v19.4s
+; NEON-NEXT:    movi v19.4s, #4, lsl #16
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    eor v3.16b, v18.16b, v3.16b
+; NEON-NEXT:    and v18.16b, v1.16b, v20.16b
+; NEON-NEXT:    movi v20.4s, #1, lsl #24
+; NEON-NEXT:    mul v4.4s, v0.4s, v4.4s
+; NEON-NEXT:    mul v5.4s, v0.4s, v5.4s
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    mul v17.4s, v0.4s, v17.4s
+; NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v19.16b
+; NEON-NEXT:    mul v18.4s, v0.4s, v18.4s
+; NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-NEXT:    movi v19.4s, #128, lsl #8
+; NEON-NEXT:    mul v16.4s, v0.4s, v16.4s
+; NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-NEXT:    mul v5.4s, v0.4s, v7.4s
+; NEON-NEXT:    and v7.16b, v1.16b, v22.16b
+; NEON-NEXT:    movi v22.4s, #16, lsl #16
+; NEON-NEXT:    mul v20.4s, v0.4s, v20.4s
+; NEON-NEXT:    eor v6.16b, v6.16b, v17.16b
+; NEON-NEXT:    eor v3.16b, v3.16b, v18.16b
+; NEON-NEXT:    and v17.16b, v1.16b, v19.16b
+; NEON-NEXT:    mul v18.4s, v0.4s, v23.4s
+; NEON-NEXT:    and v19.16b, v1.16b, v25.16b
+; NEON-NEXT:    eor v16.16b, v16.16b, v21.16b
+; NEON-NEXT:    and v21.16b, v1.16b, v24.16b
+; NEON-NEXT:    movi v23.4s, #32, lsl #24
+; NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-NEXT:    mul v5.4s, v0.4s, v7.4s
+; NEON-NEXT:    and v7.16b, v1.16b, v22.16b
+; NEON-NEXT:    movi v22.4s, #16, lsl #24
+; NEON-NEXT:    movi v24.4s, #64, lsl #24
+; NEON-NEXT:    mul v17.4s, v0.4s, v17.4s
+; NEON-NEXT:    eor v16.16b, v16.16b, v20.16b
+; NEON-NEXT:    and v20.16b, v1.16b, v26.16b
+; NEON-NEXT:    mul v19.4s, v0.4s, v19.4s
+; NEON-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
+; NEON-NEXT:    mul v6.4s, v0.4s, v21.4s
+; NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-NEXT:    and v21.16b, v1.16b, v23.16b
+; NEON-NEXT:    eor v5.16b, v16.16b, v18.16b
+; NEON-NEXT:    movi v16.4s, #128, lsl #24
+; NEON-NEXT:    mul v18.4s, v0.4s, v20.4s
+; NEON-NEXT:    and v20.16b, v1.16b, v22.16b
+; NEON-NEXT:    and v22.16b, v1.16b, v24.16b
+; NEON-NEXT:    eor v3.16b, v3.16b, v17.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v7.16b
+; NEON-NEXT:    eor v5.16b, v5.16b, v19.16b
+; NEON-NEXT:    and v1.16b, v1.16b, v16.16b
+; NEON-NEXT:    mul v7.4s, v0.4s, v20.4s
+; NEON-NEXT:    mul v16.4s, v0.4s, v21.4s
+; NEON-NEXT:    mul v17.4s, v0.4s, v22.4s
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v6.16b
+; NEON-NEXT:    eor v4.16b, v5.16b, v18.16b
+; NEON-NEXT:    mul v0.4s, v0.4s, v1.4s
+; NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
+; NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v4i32_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.4s, #2
+; NEON-AES-NEXT:    movi v3.4s, #1
+; NEON-AES-NEXT:    movi v4.4s, #4
+; NEON-AES-NEXT:    movi v5.4s, #8
+; NEON-AES-NEXT:    movi v6.4s, #16
+; NEON-AES-NEXT:    movi v7.4s, #32
+; NEON-AES-NEXT:    movi v16.4s, #64
+; NEON-AES-NEXT:    movi v17.4s, #128
+; NEON-AES-NEXT:    movi v18.4s, #1, lsl #8
+; NEON-AES-NEXT:    movi v19.4s, #2, lsl #8
+; NEON-AES-NEXT:    movi v20.4s, #8, lsl #8
+; NEON-AES-NEXT:    movi v21.4s, #128, lsl #16
+; NEON-AES-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    mul v2.4s, v0.4s, v2.4s
+; NEON-AES-NEXT:    mul v3.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    mul v4.4s, v0.4s, v4.4s
+; NEON-AES-NEXT:    mul v5.4s, v0.4s, v5.4s
+; NEON-AES-NEXT:    mul v6.4s, v0.4s, v6.4s
+; NEON-AES-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    movi v22.4s, #8, lsl #16
+; NEON-AES-NEXT:    movi v23.4s, #2, lsl #24
+; NEON-AES-NEXT:    movi v25.4s, #4, lsl #24
+; NEON-AES-NEXT:    movi v24.4s, #32, lsl #16
+; NEON-AES-NEXT:    movi v26.4s, #8, lsl #24
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    movi v4.4s, #16, lsl #8
+; NEON-AES-NEXT:    mul v5.4s, v0.4s, v16.4s
+; NEON-AES-NEXT:    mul v16.4s, v0.4s, v17.4s
+; NEON-AES-NEXT:    mul v17.4s, v0.4s, v18.4s
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    movi v19.4s, #32, lsl #8
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    movi v20.4s, #64, lsl #8
+; NEON-AES-NEXT:    mul v21.4s, v0.4s, v21.4s
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    eor v5.16b, v6.16b, v5.16b
+; NEON-AES-NEXT:    movi v4.4s, #1, lsl #16
+; NEON-AES-NEXT:    eor v6.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    movi v16.4s, #2, lsl #16
+; NEON-AES-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-AES-NEXT:    mul v18.4s, v0.4s, v18.4s
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    movi v17.4s, #4, lsl #8
+; NEON-AES-NEXT:    mul v3.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v5.16b
+; NEON-AES-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    movi v16.4s, #64, lsl #16
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    mul v7.4s, v0.4s, v19.4s
+; NEON-AES-NEXT:    movi v19.4s, #4, lsl #16
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    eor v3.16b, v18.16b, v3.16b
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    movi v20.4s, #1, lsl #24
+; NEON-AES-NEXT:    mul v4.4s, v0.4s, v4.4s
+; NEON-AES-NEXT:    mul v5.4s, v0.4s, v5.4s
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    mul v17.4s, v0.4s, v17.4s
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v7.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    mul v18.4s, v0.4s, v18.4s
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    movi v19.4s, #128, lsl #8
+; NEON-AES-NEXT:    mul v16.4s, v0.4s, v16.4s
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    mul v5.4s, v0.4s, v7.4s
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    movi v22.4s, #16, lsl #16
+; NEON-AES-NEXT:    mul v20.4s, v0.4s, v20.4s
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v17.16b
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v18.16b
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    mul v18.4s, v0.4s, v23.4s
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v25.16b
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v21.16b
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v24.16b
+; NEON-AES-NEXT:    movi v23.4s, #32, lsl #24
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    mul v5.4s, v0.4s, v7.4s
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    movi v22.4s, #16, lsl #24
+; NEON-AES-NEXT:    movi v24.4s, #64, lsl #24
+; NEON-AES-NEXT:    mul v17.4s, v0.4s, v17.4s
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v20.16b
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v26.16b
+; NEON-AES-NEXT:    mul v19.4s, v0.4s, v19.4s
+; NEON-AES-NEXT:    mul v7.4s, v0.4s, v7.4s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v6.16b
+; NEON-AES-NEXT:    mul v6.4s, v0.4s, v21.4s
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v23.16b
+; NEON-AES-NEXT:    eor v5.16b, v16.16b, v18.16b
+; NEON-AES-NEXT:    movi v16.4s, #128, lsl #24
+; NEON-AES-NEXT:    mul v18.4s, v0.4s, v20.4s
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    and v22.16b, v1.16b, v24.16b
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v17.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v7.16b
+; NEON-AES-NEXT:    eor v5.16b, v5.16b, v19.16b
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    mul v7.4s, v0.4s, v20.4s
+; NEON-AES-NEXT:    mul v16.4s, v0.4s, v21.4s
+; NEON-AES-NEXT:    mul v17.4s, v0.4s, v22.4s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v6.16b
+; NEON-AES-NEXT:    eor v4.16b, v5.16b, v18.16b
+; NEON-AES-NEXT:    mul v0.4s, v0.4s, v1.4s
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v2.16b, v4.16b, v7.16b
+; NEON-AES-NEXT:    eor v3.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    ret
   %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %a
 }
 
 define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: clmul_v2i32_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2s, #2
-; CHECK-NEXT:    movi v3.2s, #1
-; CHECK-NEXT:    movi v4.2s, #4
-; CHECK-NEXT:    movi v5.2s, #8
-; CHECK-NEXT:    movi v6.2s, #16
-; CHECK-NEXT:    movi v7.2s, #32
-; CHECK-NEXT:    movi v16.2s, #64
-; CHECK-NEXT:    movi v17.2s, #128
-; CHECK-NEXT:    movi v18.2s, #1, lsl #8
-; CHECK-NEXT:    movi v19.2s, #2, lsl #8
-; CHECK-NEXT:    movi v20.2s, #8, lsl #8
-; CHECK-NEXT:    movi v21.2s, #128, lsl #16
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    mul v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    mul v6.2s, v0.2s, v6.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    movi v22.2s, #8, lsl #16
-; CHECK-NEXT:    movi v23.2s, #2, lsl #24
-; CHECK-NEXT:    movi v25.2s, #4, lsl #24
-; CHECK-NEXT:    movi v24.2s, #32, lsl #16
-; CHECK-NEXT:    movi v26.2s, #8, lsl #24
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #16, lsl #8
-; CHECK-NEXT:    mul v5.2s, v0.2s, v16.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v17.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v18.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v19.2s, #32, lsl #8
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    movi v20.2s, #64, lsl #8
-; CHECK-NEXT:    mul v21.2s, v0.2s, v21.2s
-; CHECK-NEXT:    and v3.8b, v1.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v6.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #1, lsl #16
-; CHECK-NEXT:    eor v6.8b, v16.8b, v17.8b
-; CHECK-NEXT:    movi v16.2s, #2, lsl #16
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v17.2s, #4, lsl #8
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v5.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v16.8b
-; CHECK-NEXT:    movi v16.2s, #64, lsl #16
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v19.2s
-; CHECK-NEXT:    movi v19.2s, #4, lsl #16
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v18.8b, v3.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v20.2s, #1, lsl #24
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v3.8b, v3.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v19.2s, #128, lsl #8
-; CHECK-NEXT:    mul v16.2s, v0.2s, v16.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #16
-; CHECK-NEXT:    mul v20.2s, v0.2s, v20.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v18.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v23.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v25.8b
-; CHECK-NEXT:    eor v16.8b, v16.8b, v21.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v24.8b
-; CHECK-NEXT:    movi v23.2s, #32, lsl #24
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #24
-; CHECK-NEXT:    movi v24.2s, #64, lsl #24
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v16.8b, v16.8b, v20.8b
-; CHECK-NEXT:    and v20.8b, v1.8b, v26.8b
-; CHECK-NEXT:    mul v19.2s, v0.2s, v19.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v6.8b
-; CHECK-NEXT:    mul v6.2s, v0.2s, v21.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    eor v5.8b, v16.8b, v18.8b
-; CHECK-NEXT:    movi v16.2s, #128, lsl #24
-; CHECK-NEXT:    mul v18.2s, v0.2s, v20.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v22.8b
-; CHECK-NEXT:    and v22.8b, v1.8b, v24.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v17.8b
-; CHECK-NEXT:    eor v4.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v5.8b, v5.8b, v19.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v20.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v21.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v22.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v6.8b
-; CHECK-NEXT:    eor v4.8b, v5.8b, v18.8b
-; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    eor v1.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v2.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v3.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; NEON-LABEL: clmul_v2i32_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.2s, #2
+; NEON-NEXT:    movi v3.2s, #1
+; NEON-NEXT:    movi v4.2s, #4
+; NEON-NEXT:    movi v5.2s, #8
+; NEON-NEXT:    movi v6.2s, #16
+; NEON-NEXT:    movi v7.2s, #32
+; NEON-NEXT:    movi v16.2s, #64
+; NEON-NEXT:    movi v17.2s, #128
+; NEON-NEXT:    movi v18.2s, #1, lsl #8
+; NEON-NEXT:    movi v19.2s, #2, lsl #8
+; NEON-NEXT:    movi v20.2s, #8, lsl #8
+; NEON-NEXT:    movi v21.2s, #128, lsl #16
+; NEON-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-NEXT:    and v5.8b, v1.8b, v5.8b
+; NEON-NEXT:    and v6.8b, v1.8b, v6.8b
+; NEON-NEXT:    and v7.8b, v1.8b, v7.8b
+; NEON-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-NEXT:    and v18.8b, v1.8b, v18.8b
+; NEON-NEXT:    mul v2.2s, v0.2s, v2.2s
+; NEON-NEXT:    mul v3.2s, v0.2s, v3.2s
+; NEON-NEXT:    mul v4.2s, v0.2s, v4.2s
+; NEON-NEXT:    mul v5.2s, v0.2s, v5.2s
+; NEON-NEXT:    mul v6.2s, v0.2s, v6.2s
+; NEON-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-NEXT:    and v21.8b, v1.8b, v21.8b
+; NEON-NEXT:    movi v22.2s, #8, lsl #16
+; NEON-NEXT:    movi v23.2s, #2, lsl #24
+; NEON-NEXT:    movi v25.2s, #4, lsl #24
+; NEON-NEXT:    movi v24.2s, #32, lsl #16
+; NEON-NEXT:    movi v26.2s, #8, lsl #24
+; NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; NEON-NEXT:    eor v3.8b, v4.8b, v5.8b
+; NEON-NEXT:    movi v4.2s, #16, lsl #8
+; NEON-NEXT:    mul v5.2s, v0.2s, v16.2s
+; NEON-NEXT:    mul v16.2s, v0.2s, v17.2s
+; NEON-NEXT:    mul v17.2s, v0.2s, v18.2s
+; NEON-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-NEXT:    and v7.8b, v1.8b, v19.8b
+; NEON-NEXT:    movi v19.2s, #32, lsl #8
+; NEON-NEXT:    and v18.8b, v1.8b, v20.8b
+; NEON-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-NEXT:    movi v20.2s, #64, lsl #8
+; NEON-NEXT:    mul v21.2s, v0.2s, v21.2s
+; NEON-NEXT:    and v3.8b, v1.8b, v4.8b
+; NEON-NEXT:    eor v5.8b, v6.8b, v5.8b
+; NEON-NEXT:    movi v4.2s, #1, lsl #16
+; NEON-NEXT:    eor v6.8b, v16.8b, v17.8b
+; NEON-NEXT:    movi v16.2s, #2, lsl #16
+; NEON-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-NEXT:    mul v18.2s, v0.2s, v18.2s
+; NEON-NEXT:    and v19.8b, v1.8b, v19.8b
+; NEON-NEXT:    movi v17.2s, #4, lsl #8
+; NEON-NEXT:    mul v3.2s, v0.2s, v3.2s
+; NEON-NEXT:    eor v2.8b, v2.8b, v5.8b
+; NEON-NEXT:    and v23.8b, v1.8b, v23.8b
+; NEON-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-NEXT:    and v5.8b, v1.8b, v16.8b
+; NEON-NEXT:    movi v16.2s, #64, lsl #16
+; NEON-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-NEXT:    mul v7.2s, v0.2s, v19.2s
+; NEON-NEXT:    movi v19.2s, #4, lsl #16
+; NEON-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-NEXT:    eor v3.8b, v18.8b, v3.8b
+; NEON-NEXT:    and v18.8b, v1.8b, v20.8b
+; NEON-NEXT:    movi v20.2s, #1, lsl #24
+; NEON-NEXT:    mul v4.2s, v0.2s, v4.2s
+; NEON-NEXT:    mul v5.2s, v0.2s, v5.2s
+; NEON-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-NEXT:    mul v17.2s, v0.2s, v17.2s
+; NEON-NEXT:    eor v3.8b, v3.8b, v7.8b
+; NEON-NEXT:    and v7.8b, v1.8b, v19.8b
+; NEON-NEXT:    mul v18.2s, v0.2s, v18.2s
+; NEON-NEXT:    and v20.8b, v1.8b, v20.8b
+; NEON-NEXT:    movi v19.2s, #128, lsl #8
+; NEON-NEXT:    mul v16.2s, v0.2s, v16.2s
+; NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-NEXT:    mul v5.2s, v0.2s, v7.2s
+; NEON-NEXT:    and v7.8b, v1.8b, v22.8b
+; NEON-NEXT:    movi v22.2s, #16, lsl #16
+; NEON-NEXT:    mul v20.2s, v0.2s, v20.2s
+; NEON-NEXT:    eor v6.8b, v6.8b, v17.8b
+; NEON-NEXT:    eor v3.8b, v3.8b, v18.8b
+; NEON-NEXT:    and v17.8b, v1.8b, v19.8b
+; NEON-NEXT:    mul v18.2s, v0.2s, v23.2s
+; NEON-NEXT:    and v19.8b, v1.8b, v25.8b
+; NEON-NEXT:    eor v16.8b, v16.8b, v21.8b
+; NEON-NEXT:    and v21.8b, v1.8b, v24.8b
+; NEON-NEXT:    movi v23.2s, #32, lsl #24
+; NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-NEXT:    mul v5.2s, v0.2s, v7.2s
+; NEON-NEXT:    and v7.8b, v1.8b, v22.8b
+; NEON-NEXT:    movi v22.2s, #16, lsl #24
+; NEON-NEXT:    movi v24.2s, #64, lsl #24
+; NEON-NEXT:    mul v17.2s, v0.2s, v17.2s
+; NEON-NEXT:    eor v16.8b, v16.8b, v20.8b
+; NEON-NEXT:    and v20.8b, v1.8b, v26.8b
+; NEON-NEXT:    mul v19.2s, v0.2s, v19.2s
+; NEON-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-NEXT:    eor v2.8b, v2.8b, v6.8b
+; NEON-NEXT:    mul v6.2s, v0.2s, v21.2s
+; NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-NEXT:    and v21.8b, v1.8b, v23.8b
+; NEON-NEXT:    eor v5.8b, v16.8b, v18.8b
+; NEON-NEXT:    movi v16.2s, #128, lsl #24
+; NEON-NEXT:    mul v18.2s, v0.2s, v20.2s
+; NEON-NEXT:    and v20.8b, v1.8b, v22.8b
+; NEON-NEXT:    and v22.8b, v1.8b, v24.8b
+; NEON-NEXT:    eor v3.8b, v3.8b, v17.8b
+; NEON-NEXT:    eor v4.8b, v4.8b, v7.8b
+; NEON-NEXT:    eor v5.8b, v5.8b, v19.8b
+; NEON-NEXT:    and v1.8b, v1.8b, v16.8b
+; NEON-NEXT:    mul v7.2s, v0.2s, v20.2s
+; NEON-NEXT:    mul v16.2s, v0.2s, v21.2s
+; NEON-NEXT:    mul v17.2s, v0.2s, v22.2s
+; NEON-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-NEXT:    eor v3.8b, v4.8b, v6.8b
+; NEON-NEXT:    eor v4.8b, v5.8b, v18.8b
+; NEON-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NEON-NEXT:    eor v1.8b, v2.8b, v3.8b
+; NEON-NEXT:    eor v2.8b, v4.8b, v7.8b
+; NEON-NEXT:    eor v3.8b, v16.8b, v17.8b
+; NEON-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; NEON-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v2i32_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.2s, #2
+; NEON-AES-NEXT:    movi v3.2s, #1
+; NEON-AES-NEXT:    movi v4.2s, #4
+; NEON-AES-NEXT:    movi v5.2s, #8
+; NEON-AES-NEXT:    movi v6.2s, #16
+; NEON-AES-NEXT:    movi v7.2s, #32
+; NEON-AES-NEXT:    movi v16.2s, #64
+; NEON-AES-NEXT:    movi v17.2s, #128
+; NEON-AES-NEXT:    movi v18.2s, #1, lsl #8
+; NEON-AES-NEXT:    movi v19.2s, #2, lsl #8
+; NEON-AES-NEXT:    movi v20.2s, #8, lsl #8
+; NEON-AES-NEXT:    movi v21.2s, #128, lsl #16
+; NEON-AES-NEXT:    and v2.8b, v1.8b, v2.8b
+; NEON-AES-NEXT:    and v3.8b, v1.8b, v3.8b
+; NEON-AES-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-AES-NEXT:    and v5.8b, v1.8b, v5.8b
+; NEON-AES-NEXT:    and v6.8b, v1.8b, v6.8b
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v7.8b
+; NEON-AES-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-AES-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-AES-NEXT:    and v18.8b, v1.8b, v18.8b
+; NEON-AES-NEXT:    mul v2.2s, v0.2s, v2.2s
+; NEON-AES-NEXT:    mul v3.2s, v0.2s, v3.2s
+; NEON-AES-NEXT:    mul v4.2s, v0.2s, v4.2s
+; NEON-AES-NEXT:    mul v5.2s, v0.2s, v5.2s
+; NEON-AES-NEXT:    mul v6.2s, v0.2s, v6.2s
+; NEON-AES-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-AES-NEXT:    and v21.8b, v1.8b, v21.8b
+; NEON-AES-NEXT:    movi v22.2s, #8, lsl #16
+; NEON-AES-NEXT:    movi v23.2s, #2, lsl #24
+; NEON-AES-NEXT:    movi v25.2s, #4, lsl #24
+; NEON-AES-NEXT:    movi v24.2s, #32, lsl #16
+; NEON-AES-NEXT:    movi v26.2s, #8, lsl #24
+; NEON-AES-NEXT:    eor v2.8b, v3.8b, v2.8b
+; NEON-AES-NEXT:    eor v3.8b, v4.8b, v5.8b
+; NEON-AES-NEXT:    movi v4.2s, #16, lsl #8
+; NEON-AES-NEXT:    mul v5.2s, v0.2s, v16.2s
+; NEON-AES-NEXT:    mul v16.2s, v0.2s, v17.2s
+; NEON-AES-NEXT:    mul v17.2s, v0.2s, v18.2s
+; NEON-AES-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v19.8b
+; NEON-AES-NEXT:    movi v19.2s, #32, lsl #8
+; NEON-AES-NEXT:    and v18.8b, v1.8b, v20.8b
+; NEON-AES-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-AES-NEXT:    movi v20.2s, #64, lsl #8
+; NEON-AES-NEXT:    mul v21.2s, v0.2s, v21.2s
+; NEON-AES-NEXT:    and v3.8b, v1.8b, v4.8b
+; NEON-AES-NEXT:    eor v5.8b, v6.8b, v5.8b
+; NEON-AES-NEXT:    movi v4.2s, #1, lsl #16
+; NEON-AES-NEXT:    eor v6.8b, v16.8b, v17.8b
+; NEON-AES-NEXT:    movi v16.2s, #2, lsl #16
+; NEON-AES-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-AES-NEXT:    mul v18.2s, v0.2s, v18.2s
+; NEON-AES-NEXT:    and v19.8b, v1.8b, v19.8b
+; NEON-AES-NEXT:    movi v17.2s, #4, lsl #8
+; NEON-AES-NEXT:    mul v3.2s, v0.2s, v3.2s
+; NEON-AES-NEXT:    eor v2.8b, v2.8b, v5.8b
+; NEON-AES-NEXT:    and v23.8b, v1.8b, v23.8b
+; NEON-AES-NEXT:    and v4.8b, v1.8b, v4.8b
+; NEON-AES-NEXT:    and v5.8b, v1.8b, v16.8b
+; NEON-AES-NEXT:    movi v16.2s, #64, lsl #16
+; NEON-AES-NEXT:    eor v6.8b, v6.8b, v7.8b
+; NEON-AES-NEXT:    mul v7.2s, v0.2s, v19.2s
+; NEON-AES-NEXT:    movi v19.2s, #4, lsl #16
+; NEON-AES-NEXT:    and v17.8b, v1.8b, v17.8b
+; NEON-AES-NEXT:    eor v3.8b, v18.8b, v3.8b
+; NEON-AES-NEXT:    and v18.8b, v1.8b, v20.8b
+; NEON-AES-NEXT:    movi v20.2s, #1, lsl #24
+; NEON-AES-NEXT:    mul v4.2s, v0.2s, v4.2s
+; NEON-AES-NEXT:    mul v5.2s, v0.2s, v5.2s
+; NEON-AES-NEXT:    and v16.8b, v1.8b, v16.8b
+; NEON-AES-NEXT:    mul v17.2s, v0.2s, v17.2s
+; NEON-AES-NEXT:    eor v3.8b, v3.8b, v7.8b
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v19.8b
+; NEON-AES-NEXT:    mul v18.2s, v0.2s, v18.2s
+; NEON-AES-NEXT:    and v20.8b, v1.8b, v20.8b
+; NEON-AES-NEXT:    movi v19.2s, #128, lsl #8
+; NEON-AES-NEXT:    mul v16.2s, v0.2s, v16.2s
+; NEON-AES-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-AES-NEXT:    mul v5.2s, v0.2s, v7.2s
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v22.8b
+; NEON-AES-NEXT:    movi v22.2s, #16, lsl #16
+; NEON-AES-NEXT:    mul v20.2s, v0.2s, v20.2s
+; NEON-AES-NEXT:    eor v6.8b, v6.8b, v17.8b
+; NEON-AES-NEXT:    eor v3.8b, v3.8b, v18.8b
+; NEON-AES-NEXT:    and v17.8b, v1.8b, v19.8b
+; NEON-AES-NEXT:    mul v18.2s, v0.2s, v23.2s
+; NEON-AES-NEXT:    and v19.8b, v1.8b, v25.8b
+; NEON-AES-NEXT:    eor v16.8b, v16.8b, v21.8b
+; NEON-AES-NEXT:    and v21.8b, v1.8b, v24.8b
+; NEON-AES-NEXT:    movi v23.2s, #32, lsl #24
+; NEON-AES-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-AES-NEXT:    mul v5.2s, v0.2s, v7.2s
+; NEON-AES-NEXT:    and v7.8b, v1.8b, v22.8b
+; NEON-AES-NEXT:    movi v22.2s, #16, lsl #24
+; NEON-AES-NEXT:    movi v24.2s, #64, lsl #24
+; NEON-AES-NEXT:    mul v17.2s, v0.2s, v17.2s
+; NEON-AES-NEXT:    eor v16.8b, v16.8b, v20.8b
+; NEON-AES-NEXT:    and v20.8b, v1.8b, v26.8b
+; NEON-AES-NEXT:    mul v19.2s, v0.2s, v19.2s
+; NEON-AES-NEXT:    mul v7.2s, v0.2s, v7.2s
+; NEON-AES-NEXT:    eor v2.8b, v2.8b, v6.8b
+; NEON-AES-NEXT:    mul v6.2s, v0.2s, v21.2s
+; NEON-AES-NEXT:    eor v4.8b, v4.8b, v5.8b
+; NEON-AES-NEXT:    and v21.8b, v1.8b, v23.8b
+; NEON-AES-NEXT:    eor v5.8b, v16.8b, v18.8b
+; NEON-AES-NEXT:    movi v16.2s, #128, lsl #24
+; NEON-AES-NEXT:    mul v18.2s, v0.2s, v20.2s
+; NEON-AES-NEXT:    and v20.8b, v1.8b, v22.8b
+; NEON-AES-NEXT:    and v22.8b, v1.8b, v24.8b
+; NEON-AES-NEXT:    eor v3.8b, v3.8b, v17.8b
+; NEON-AES-NEXT:    eor v4.8b, v4.8b, v7.8b
+; NEON-AES-NEXT:    eor v5.8b, v5.8b, v19.8b
+; NEON-AES-NEXT:    and v1.8b, v1.8b, v16.8b
+; NEON-AES-NEXT:    mul v7.2s, v0.2s, v20.2s
+; NEON-AES-NEXT:    mul v16.2s, v0.2s, v21.2s
+; NEON-AES-NEXT:    mul v17.2s, v0.2s, v22.2s
+; NEON-AES-NEXT:    eor v2.8b, v2.8b, v3.8b
+; NEON-AES-NEXT:    eor v3.8b, v4.8b, v6.8b
+; NEON-AES-NEXT:    eor v4.8b, v5.8b, v18.8b
+; NEON-AES-NEXT:    mul v0.2s, v0.2s, v1.2s
+; NEON-AES-NEXT:    eor v1.8b, v2.8b, v3.8b
+; NEON-AES-NEXT:    eor v2.8b, v4.8b, v7.8b
+; NEON-AES-NEXT:    eor v3.8b, v16.8b, v17.8b
+; NEON-AES-NEXT:    eor v1.8b, v1.8b, v2.8b
+; NEON-AES-NEXT:    eor v0.8b, v3.8b, v0.8b
+; NEON-AES-NEXT:    eor v0.8b, v1.8b, v0.8b
+; NEON-AES-NEXT:    ret
   %a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %a
 }
@@ -443,16 +850,8146 @@ define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
 ;   ret <1 x i64> %a
 ; }
 
-; TODO: Fix: lower with PMULL and PMULL2 .d variants which require FEAT_PMULL
-; define <2 x i64> @clmul_v2i64_neon_aes(<2 x i64> %x, <2 x i64> %y) #0 {
-;   %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %x, <2 x i64> %y)
-;   ret <2 x i64> %a
-; }
+define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
+; NEON-LABEL: clmul_v1i128_neon:
+; NEON:       // %bb.0:
+; NEON-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-NEXT:    sub sp, sp, #1952
+; NEON-NEXT:    .cfi_def_cfa_offset 2048
+; NEON-NEXT:    .cfi_offset w19, -8
+; NEON-NEXT:    .cfi_offset w20, -16
+; NEON-NEXT:    .cfi_offset w21, -24
+; NEON-NEXT:    .cfi_offset w22, -32
+; NEON-NEXT:    .cfi_offset w23, -40
+; NEON-NEXT:    .cfi_offset w24, -48
+; NEON-NEXT:    .cfi_offset w25, -56
+; NEON-NEXT:    .cfi_offset w26, -64
+; NEON-NEXT:    .cfi_offset w27, -72
+; NEON-NEXT:    .cfi_offset w28, -80
+; NEON-NEXT:    .cfi_offset w30, -88
+; NEON-NEXT:    .cfi_offset w29, -96
+; NEON-NEXT:    rbit x8, x2
+; NEON-NEXT:    rbit x9, x0
+; NEON-NEXT:    and x7, x2, #0x2
+; NEON-NEXT:    and x18, x2, #0x1
+; NEON-NEXT:    and x4, x2, #0x4
+; NEON-NEXT:    and x5, x2, #0x10
+; NEON-NEXT:    and x10, x8, #0x2
+; NEON-NEXT:    and x6, x2, #0x80
+; NEON-NEXT:    and x17, x2, #0x800
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1944] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1936] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x4
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1928] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x8
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1920] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x10
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1912] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x20
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1904] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x40
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1888] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x80
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1896] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x100
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1880] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x200
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1864] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x400
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1872] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x800
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1856] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1832] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x2000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1840] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x4000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1824] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x8000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1848] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x10000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1816] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x20000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1792] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x40000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1784] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x80000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1808] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x100000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1776] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x200000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1800] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x400000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1768] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x800000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1728] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1760] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x2000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1720] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x4000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1752] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x8000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1736] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x10000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1744] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x20000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1696] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x40000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1656] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x80000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1688] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x100000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1672] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x200000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1680] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x400000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1664] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x800000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1704] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1712] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x2000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1640] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x4000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1608] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x8000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1584] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x10000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1632] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x20000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1600] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x40000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1616] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x80000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1592] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x100000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1624] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x200000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1648] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x400000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1568] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x800000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1520] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1536] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x2000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1512] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x4000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1560] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x8000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1504] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x10000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1552] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x20000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1528] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x40000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1576] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x80000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1544] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x100000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1480] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x200000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1448] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x400000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1472] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x800000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1456] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x1000000000000000
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x10, [sp, #1488] // 8-byte Spill
+; NEON-NEXT:    and x10, x8, #0x2000000000000000
+; NEON-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-NEXT:    mul x8, x9, x8
+; NEON-NEXT:    mul x10, x9, x10
+; NEON-NEXT:    str x8, [sp, #1496] // 8-byte Spill
+; NEON-NEXT:    mul x8, x1, x7
+; NEON-NEXT:    str x10, [sp, #1464] // 8-byte Spill
+; NEON-NEXT:    str x8, [sp, #1032] // 8-byte Spill
+; NEON-NEXT:    mul x8, x1, x18
+; NEON-NEXT:    mul x18, x0, x18
+; NEON-NEXT:    str x8, [sp, #1008] // 8-byte Spill
+; NEON-NEXT:    mul x8, x1, x4
+; NEON-NEXT:    str x18, [sp, #1128] // 8-byte Spill
+; NEON-NEXT:    mul x18, x0, x4
+; NEON-NEXT:    str x8, [sp, #992] // 8-byte Spill
+; NEON-NEXT:    and x8, x2, #0x8
+; NEON-NEXT:    mul x9, x1, x8
+; NEON-NEXT:    str x18, [sp, #1120] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x8
+; NEON-NEXT:    str x9, [sp, #976] // 8-byte Spill
+; NEON-NEXT:    mul x9, x1, x5
+; NEON-NEXT:    str x8, [sp, #1112] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x5
+; NEON-NEXT:    str x9, [sp, #952] // 8-byte Spill
+; NEON-NEXT:    and x9, x2, #0x20
+; NEON-NEXT:    mul x10, x1, x9
+; NEON-NEXT:    str x8, [sp, #1096] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x9
+; NEON-NEXT:    ldr x9, [sp, #1008] // 8-byte Reload
+; NEON-NEXT:    str x10, [sp, #928] // 8-byte Spill
+; NEON-NEXT:    and x10, x2, #0x40
+; NEON-NEXT:    mul x11, x1, x10
+; NEON-NEXT:    str x8, [sp, #1072] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x10
+; NEON-NEXT:    ldr x10, [sp, #976] // 8-byte Reload
+; NEON-NEXT:    str x11, [sp, #944] // 8-byte Spill
+; NEON-NEXT:    mul x11, x1, x6
+; NEON-NEXT:    str x8, [sp, #1104] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x6
+; NEON-NEXT:    str x11, [sp, #936] // 8-byte Spill
+; NEON-NEXT:    and x11, x2, #0x100
+; NEON-NEXT:    mul x12, x1, x11
+; NEON-NEXT:    str x8, [sp, #1064] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x11
+; NEON-NEXT:    ldr x11, [sp, #928] // 8-byte Reload
+; NEON-NEXT:    str x12, [sp, #920] // 8-byte Spill
+; NEON-NEXT:    and x12, x2, #0x200
+; NEON-NEXT:    mul x13, x1, x12
+; NEON-NEXT:    str x8, [sp, #1048] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x12
+; NEON-NEXT:    str x13, [sp, #968] // 8-byte Spill
+; NEON-NEXT:    and x13, x2, #0x400
+; NEON-NEXT:    mul x14, x1, x13
+; NEON-NEXT:    str x8, [sp, #1040] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x13
+; NEON-NEXT:    str x14, [sp, #960] // 8-byte Spill
+; NEON-NEXT:    mul x14, x1, x17
+; NEON-NEXT:    str x8, [sp, #1088] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x17
+; NEON-NEXT:    str x14, [sp, #848] // 8-byte Spill
+; NEON-NEXT:    and x14, x2, #0x1000
+; NEON-NEXT:    mul x15, x1, x14
+; NEON-NEXT:    str x8, [sp, #1000] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x14
+; NEON-NEXT:    str x15, [sp, #824] // 8-byte Spill
+; NEON-NEXT:    and x15, x2, #0x2000
+; NEON-NEXT:    mul x16, x1, x15
+; NEON-NEXT:    ldr x12, [sp, #824] // 8-byte Reload
+; NEON-NEXT:    str x8, [sp, #984] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x15
+; NEON-NEXT:    str x16, [sp, #896] // 8-byte Spill
+; NEON-NEXT:    and x16, x2, #0x4000
+; NEON-NEXT:    mul x19, x1, x16
+; NEON-NEXT:    str x8, [sp, #1024] // 8-byte Spill
+; NEON-NEXT:    mul x8, x0, x16
+; NEON-NEXT:    str x19, [sp, #888] // 8-byte Spill
+; NEON-NEXT:    and x19, x2, #0x8000
+; NEON-NEXT:    mul x20, x1, x19
+; NEON-NEXT:    str x8, [sp, #1016] // 8-byte Spill
+; NEON-NEXT:    ldr x8, [sp, #1032] // 8-byte Reload
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    ldr x9, [sp, #992] // 8-byte Reload
+; NEON-NEXT:    str x20, [sp, #904] // 8-byte Spill
+; NEON-NEXT:    and x20, x2, #0x10000
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    mul x21, x1, x20
+; NEON-NEXT:    ldr x10, [sp, #952] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #944] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x11, x0, x19
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    ldr x10, [sp, #936] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #968] // 8-byte Reload
+; NEON-NEXT:    str x21, [sp, #880] // 8-byte Spill
+; NEON-NEXT:    and x21, x2, #0x20000
+; NEON-NEXT:    mul x22, x1, x21
+; NEON-NEXT:    str x11, [sp, #1032] // 8-byte Spill
+; NEON-NEXT:    ldr x11, [sp, #920] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #848] // 8-byte Reload
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    ldr x10, [sp, #896] // 8-byte Reload
+; NEON-NEXT:    str x22, [sp, #840] // 8-byte Spill
+; NEON-NEXT:    and x22, x2, #0x40000
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x23, x1, x22
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #960] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    ldr x11, [sp, #888] // 8-byte Reload
+; NEON-NEXT:    mul x12, x0, x20
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #904] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x11, x0, x21
+; NEON-NEXT:    str x23, [sp, #832] // 8-byte Spill
+; NEON-NEXT:    and x23, x2, #0x80000
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    mul x24, x1, x23
+; NEON-NEXT:    ldr x10, [sp, #880] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    str x12, [sp, #1008] // 8-byte Spill
+; NEON-NEXT:    str x11, [sp, #992] // 8-byte Spill
+; NEON-NEXT:    ldr x11, [sp, #840] // 8-byte Reload
+; NEON-NEXT:    str x24, [sp, #872] // 8-byte Spill
+; NEON-NEXT:    and x24, x2, #0x100000
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x25, x1, x24
+; NEON-NEXT:    ldr x11, [sp, #832] // 8-byte Reload
+; NEON-NEXT:    ldr x9, [sp, #872] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x25, [sp, #864] // 8-byte Spill
+; NEON-NEXT:    and x25, x2, #0x200000
+; NEON-NEXT:    mul x26, x1, x25
+; NEON-NEXT:    str x26, [sp, #912] // 8-byte Spill
+; NEON-NEXT:    and x26, x2, #0x400000
+; NEON-NEXT:    mul x27, x1, x26
+; NEON-NEXT:    str x27, [sp, #760] // 8-byte Spill
+; NEON-NEXT:    and x27, x2, #0x800000
+; NEON-NEXT:    mul x28, x1, x27
+; NEON-NEXT:    ldr x11, [sp, #760] // 8-byte Reload
+; NEON-NEXT:    str x28, [sp, #736] // 8-byte Spill
+; NEON-NEXT:    and x28, x2, #0x1000000
+; NEON-NEXT:    mul x29, x1, x28
+; NEON-NEXT:    ldr x12, [sp, #736] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x22
+; NEON-NEXT:    str x29, [sp, #784] // 8-byte Spill
+; NEON-NEXT:    and x29, x2, #0x2000000
+; NEON-NEXT:    mul x30, x1, x29
+; NEON-NEXT:    ldr x10, [sp, #784] // 8-byte Reload
+; NEON-NEXT:    str x12, [sp, #976] // 8-byte Spill
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #864] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    str x30, [sp, #776] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    ldr x11, [sp, #776] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x11, x0, x23
+; NEON-NEXT:    str x30, [sp, #800] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000
+; NEON-NEXT:    str x30, [sp, #1144] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x11, [sp, #968] // 8-byte Spill
+; NEON-NEXT:    ldr x11, [sp, #912] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    ldr x11, [sp, #800] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    str x30, [sp, #792] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000
+; NEON-NEXT:    str x30, [sp, #1152] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #816] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000
+; NEON-NEXT:    str x30, [sp, #1160] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000
+; NEON-NEXT:    ldr x9, [sp, #816] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #728] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000
+; NEON-NEXT:    str x30, [sp, #1168] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000
+; NEON-NEXT:    ldr x11, [sp, #728] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #696] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000
+; NEON-NEXT:    str x30, [sp, #1176] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000
+; NEON-NEXT:    ldr x12, [sp, #696] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #792] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x30, [sp, #688] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    str x30, [sp, #1184] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000
+; NEON-NEXT:    ldr x12, [sp, #688] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x24
+; NEON-NEXT:    str x30, [sp, #744] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000
+; NEON-NEXT:    str x30, [sp, #1192] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000
+; NEON-NEXT:    ldr x10, [sp, #744] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #960] // 8-byte Spill
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    str x30, [sp, #720] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000
+; NEON-NEXT:    str x30, [sp, #1200] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000
+; NEON-NEXT:    ldr x11, [sp, #720] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    str x30, [sp, #768] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000
+; NEON-NEXT:    str x30, [sp, #1208] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000
+; NEON-NEXT:    ldr x9, [sp, #768] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x30, [sp, #808] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000
+; NEON-NEXT:    str x30, [sp, #1216] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #856] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000
+; NEON-NEXT:    str x30, [sp, #1224] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #648] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000
+; NEON-NEXT:    str x30, [sp, #1232] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000
+; NEON-NEXT:    ldr x11, [sp, #648] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #632] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000
+; NEON-NEXT:    str x30, [sp, #1240] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000000
+; NEON-NEXT:    ldr x12, [sp, #632] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x25
+; NEON-NEXT:    str x30, [sp, #664] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000000
+; NEON-NEXT:    str x30, [sp, #1248] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000000
+; NEON-NEXT:    ldr x10, [sp, #664] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #952] // 8-byte Spill
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    str x30, [sp, #640] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000000
+; NEON-NEXT:    str x30, [sp, #1256] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000000
+; NEON-NEXT:    ldr x11, [sp, #640] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    str x30, [sp, #680] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000000
+; NEON-NEXT:    str x30, [sp, #1264] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #672] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000000
+; NEON-NEXT:    str x30, [sp, #1272] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #712] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000000
+; NEON-NEXT:    str x30, [sp, #1280] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #704] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000000
+; NEON-NEXT:    str x30, [sp, #1288] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #752] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000000
+; NEON-NEXT:    str x30, [sp, #1296] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #520] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000000
+; NEON-NEXT:    str x30, [sp, #1304] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000000
+; NEON-NEXT:    ldr x11, [sp, #520] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #504] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000000
+; NEON-NEXT:    str x30, [sp, #1312] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000000
+; NEON-NEXT:    ldr x12, [sp, #504] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x26
+; NEON-NEXT:    str x30, [sp, #560] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000000
+; NEON-NEXT:    str x30, [sp, #1320] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #944] // 8-byte Spill
+; NEON-NEXT:    ldr x12, [sp, #808] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    ldr x12, [sp, #680] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #560] // 8-byte Reload
+; NEON-NEXT:    str x30, [sp, #552] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000000
+; NEON-NEXT:    str x30, [sp, #1328] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000000
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    ldr x12, [sp, #672] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #552] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x27
+; NEON-NEXT:    str x30, [sp, #584] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000000
+; NEON-NEXT:    str x30, [sp, #1336] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #936] // 8-byte Spill
+; NEON-NEXT:    ldr x12, [sp, #856] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    ldr x12, [sp, #712] // 8-byte Reload
+; NEON-NEXT:    str x30, [sp, #576] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x8000000000000
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    str x30, [sp, #1344] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000000000
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    ldr x12, [sp, #584] // 8-byte Reload
+; NEON-NEXT:    ldr x9, [sp, #752] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #704] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #576] // 8-byte Reload
+; NEON-NEXT:    str x30, [sp, #608] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x10000000000000
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x30, [sp, #1352] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000000000
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    ldr x10, [sp, #608] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    mul x12, x0, x28
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    str x30, [sp, #592] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x20000000000000
+; NEON-NEXT:    str x30, [sp, #1360] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000000000
+; NEON-NEXT:    ldr x11, [sp, #592] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #928] // 8-byte Spill
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    str x30, [sp, #624] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x40000000000000
+; NEON-NEXT:    str x30, [sp, #1368] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000000000
+; NEON-NEXT:    ldr x9, [sp, #624] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x30, [sp, #616] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x80000000000000
+; NEON-NEXT:    str x30, [sp, #1376] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #528] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x100000000000000
+; NEON-NEXT:    str x30, [sp, #1384] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000000000
+; NEON-NEXT:    ldr x11, [sp, #528] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #512] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x200000000000000
+; NEON-NEXT:    str x30, [sp, #1392] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000000000
+; NEON-NEXT:    ldr x12, [sp, #512] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    mul x12, x0, x29
+; NEON-NEXT:    str x30, [sp, #544] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x400000000000000
+; NEON-NEXT:    str x30, [sp, #1400] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000000000
+; NEON-NEXT:    ldr x10, [sp, #544] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x12, [sp, #920] // 8-byte Spill
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #616] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    eor x21, x8, x9
+; NEON-NEXT:    str x30, [sp, #536] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x800000000000000
+; NEON-NEXT:    str x30, [sp, #1408] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000000000
+; NEON-NEXT:    ldr x11, [sp, #536] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1144] // 8-byte Reload
+; NEON-NEXT:    mul x11, x0, x11
+; NEON-NEXT:    str x30, [sp, #568] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x1000000000000000
+; NEON-NEXT:    str x30, [sp, #1416] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000000000
+; NEON-NEXT:    ldr x8, [sp, #568] // 8-byte Reload
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    eor x8, x10, x8
+; NEON-NEXT:    str x11, [sp, #1144] // 8-byte Spill
+; NEON-NEXT:    str x30, [sp, #600] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x2000000000000000
+; NEON-NEXT:    str x30, [sp, #1424] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000000000
+; NEON-NEXT:    mul x30, x1, x30
+; NEON-NEXT:    str x30, [sp, #656] // 8-byte Spill
+; NEON-NEXT:    and x30, x2, #0x4000000000000000
+; NEON-NEXT:    and x2, x2, #0x8000000000000000
+; NEON-NEXT:    str x30, [sp, #1432] // 8-byte Spill
+; NEON-NEXT:    mul x30, x1, x2
+; NEON-NEXT:    and x1, x3, #0x2
+; NEON-NEXT:    str x2, [sp, #1440] // 8-byte Spill
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x1
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    stp x1, x2, [sp, #488] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x4
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x8
+; NEON-NEXT:    ldp x10, x9, [sp, #488] // 16-byte Folded Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    str x1, [sp, #456] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x10
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #456] // 8-byte Reload
+; NEON-NEXT:    stp x2, x1, [sp, #472] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x20
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x10, [sp, #472] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1152] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #480] // 8-byte Reload
+; NEON-NEXT:    mul x11, x0, x11
+; NEON-NEXT:    str x1, [sp, #464] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x40
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x80
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x11, [sp, #1152] // 8-byte Spill
+; NEON-NEXT:    ldr x11, [sp, #600] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #464] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #424] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x100
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x1, [sp, #384] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x200
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #384] // 8-byte Reload
+; NEON-NEXT:    stp x1, x2, [sp, #440] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x400
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x800
+; NEON-NEXT:    ldr x11, [sp, #448] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #424] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #440] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1160] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    mul x12, x0, x12
+; NEON-NEXT:    str x1, [sp, #392] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x1000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x12, [sp, #1160] // 8-byte Spill
+; NEON-NEXT:    ldr x12, [sp, #656] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #376] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x2000
+; NEON-NEXT:    eor x8, x8, x12
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #376] // 8-byte Reload
+; NEON-NEXT:    eor x16, x8, x30
+; NEON-NEXT:    stp x1, x2, [sp, #408] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x4000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #416] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #392] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #408] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1168] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #400] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x8000
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    mul x12, x0, x12
+; NEON-NEXT:    str x1, [sp, #432] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x10000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x20000
+; NEON-NEXT:    str x12, [sp, #1168] // 8-byte Spill
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x1, [sp, #328] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x40000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #328] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #320] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x80000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    stp x1, x2, [sp, #352] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x100000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #360] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #400] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #320] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #344] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x200000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1176] // 8-byte Reload
+; NEON-NEXT:    mul x12, x0, x12
+; NEON-NEXT:    str x1, [sp, #368] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x400000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x800000
+; NEON-NEXT:    str x12, [sp, #1176] // 8-byte Spill
+; NEON-NEXT:    ldr x12, [sp, #432] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #352] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #368] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    str x1, [sp, #232] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x1000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x13, [sp, #232] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #224] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x2000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    stp x2, x1, [sp, #280] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x4000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #280] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #344] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x13
+; NEON-NEXT:    ldr x13, [sp, #224] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #272] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x8000000
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x10000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #1184] // 8-byte Reload
+; NEON-NEXT:    ldr x11, [sp, #288] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x11, x12, x11
+; NEON-NEXT:    ldr x12, [sp, #272] // 8-byte Reload
+; NEON-NEXT:    mul x13, x0, x13
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    stp x1, x2, [sp, #304] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x20000000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x40000000
+; NEON-NEXT:    ldr x10, [sp, #312] // 8-byte Reload
+; NEON-NEXT:    str x13, [sp, #1184] // 8-byte Spill
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    stp x1, x2, [sp, #168] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x80000000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x100000000
+; NEON-NEXT:    ldp x13, x12, [sp, #168] // 16-byte Folded Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #1192] // 8-byte Reload
+; NEON-NEXT:    mul x13, x0, x13
+; NEON-NEXT:    stp x1, x2, [sp, #200] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x200000000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x400000000
+; NEON-NEXT:    ldr x11, [sp, #208] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x11, x12, x11
+; NEON-NEXT:    ldr x12, [sp, #304] // 8-byte Reload
+; NEON-NEXT:    str x13, [sp, #1192] // 8-byte Spill
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #200] // 8-byte Reload
+; NEON-NEXT:    eor x8, x9, x10
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1200] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #216] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x800000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x10, [sp, #216] // 8-byte Reload
+; NEON-NEXT:    mul x12, x0, x12
+; NEON-NEXT:    stp x2, x1, [sp, #256] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x1000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x9, [sp, #256] // 8-byte Reload
+; NEON-NEXT:    str x12, [sp, #1200] // 8-byte Spill
+; NEON-NEXT:    eor x9, x11, x9
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    str x1, [sp, #336] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x2000000000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x4000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x1, [sp, #136] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x8000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #136] // 8-byte Reload
+; NEON-NEXT:    stp x1, x2, [sp, #152] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x10000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x10, [sp, #160] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1208] // 8-byte Reload
+; NEON-NEXT:    mul x11, x0, x11
+; NEON-NEXT:    str x1, [sp, #144] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x20000000000
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x40000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x11, [sp, #1208] // 8-byte Spill
+; NEON-NEXT:    ldr x11, [sp, #264] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    ldr x11, [sp, #152] // 8-byte Reload
+; NEON-NEXT:    stp x1, x2, [sp, #184] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x80000000000
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    mul x2, x0, x1
+; NEON-NEXT:    and x1, x3, #0x100000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    stp x1, x2, [sp, #240] // 16-byte Folded Spill
+; NEON-NEXT:    and x1, x3, #0x200000000000
+; NEON-NEXT:    and x2, x3, #0x1000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    mul x2, x0, x2
+; NEON-NEXT:    str x1, [sp, #296] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x400000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x1, [sp, #40] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x800000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #40] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #16] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x2000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #16] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #144] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x2
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #1216] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #56] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x4000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    mul x23, x0, x12
+; NEON-NEXT:    ldr x12, [sp, #336] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    ldr x12, [sp, #192] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #48] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x8000000000000
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #56] // 8-byte Reload
+; NEON-NEXT:    ldr x9, [sp, #248] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #184] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    ldr x12, [sp, #48] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #96] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x10000000000000
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x10, [sp, #96] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1224] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #240] // 8-byte Reload
+; NEON-NEXT:    mul x24, x0, x12
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    str x1, [sp, #88] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x20000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #88] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1232] // 8-byte Reload
+; NEON-NEXT:    mul x25, x0, x11
+; NEON-NEXT:    ldr x11, [sp, #296] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #112] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x40000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    ldr x11, [sp, #112] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    str x1, [sp, #104] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x80000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    str x1, [sp, #120] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x100000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x9, [sp, #120] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #80] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x200000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x11, [sp, #80] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #32] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x400000000000000
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #32] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #104] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x12
+; NEON-NEXT:    str x1, [sp, #24] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x800000000000000
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    ldr x12, [sp, #24] // 8-byte Reload
+; NEON-NEXT:    eor x15, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #1248] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1240] // 8-byte Reload
+; NEON-NEXT:    mul x27, x0, x9
+; NEON-NEXT:    ldr x9, [sp, #1944] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #72] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x1000000000000000
+; NEON-NEXT:    mul x26, x0, x12
+; NEON-NEXT:    ldr x10, [sp, #72] // 8-byte Reload
+; NEON-NEXT:    ldr x12, [sp, #1904] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #1920] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #64] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x2000000000000000
+; NEON-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x8, x10, x8
+; NEON-NEXT:    ldr x10, [sp, #1936] // 8-byte Reload
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    ldr x10, [sp, #1928] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #128] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x4000000000000000
+; NEON-NEXT:    eor x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1912] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1888] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1792] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x10
+; NEON-NEXT:    ldr x11, [sp, #1256] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1880] // 8-byte Reload
+; NEON-NEXT:    mul x28, x0, x11
+; NEON-NEXT:    ldr x11, [sp, #128] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #1056] // 8-byte Spill
+; NEON-NEXT:    and x1, x3, #0x8000000000000000
+; NEON-NEXT:    eor x14, x8, x11
+; NEON-NEXT:    ldr x8, [sp, #1896] // 8-byte Reload
+; NEON-NEXT:    ldr x11, [sp, #1832] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x1
+; NEON-NEXT:    eor x10, x8, x10
+; NEON-NEXT:    ldr x8, [sp, #1864] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x8
+; NEON-NEXT:    ldr x8, [sp, #1856] // 8-byte Reload
+; NEON-NEXT:    eor x11, x8, x11
+; NEON-NEXT:    ldr x8, [sp, #1264] // 8-byte Reload
+; NEON-NEXT:    str x1, [sp, #1080] // 8-byte Spill
+; NEON-NEXT:    mul x1, x0, x7
+; NEON-NEXT:    mul x29, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1872] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x8
+; NEON-NEXT:    ldr x8, [sp, #1840] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    str x1, [sp, #1136] // 8-byte Spill
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1816] // 8-byte Reload
+; NEON-NEXT:    eor x12, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #1824] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1784] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1272] // 8-byte Reload
+; NEON-NEXT:    mul x30, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1848] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1808] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    eor x11, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1776] // 8-byte Reload
+; NEON-NEXT:    ldr x12, [sp, #1728] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1768] // 8-byte Reload
+; NEON-NEXT:    eor x12, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #1280] // 8-byte Reload
+; NEON-NEXT:    mul x22, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1800] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1760] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    eor x11, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1720] // 8-byte Reload
+; NEON-NEXT:    ldr x12, [sp, #1656] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1696] // 8-byte Reload
+; NEON-NEXT:    eor x12, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #1288] // 8-byte Reload
+; NEON-NEXT:    mul x20, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1752] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1688] // 8-byte Reload
+; NEON-NEXT:    eor x11, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1736] // 8-byte Reload
+; NEON-NEXT:    ldr x12, [sp, #1608] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x8
+; NEON-NEXT:    ldr x8, [sp, #1672] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1296] // 8-byte Reload
+; NEON-NEXT:    mul x19, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1744] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x8
+; NEON-NEXT:    ldr x8, [sp, #1680] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1640] // 8-byte Reload
+; NEON-NEXT:    eor x12, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #1664] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1584] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1304] // 8-byte Reload
+; NEON-NEXT:    mul x7, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1704] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1632] // 8-byte Reload
+; NEON-NEXT:    eor x11, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1600] // 8-byte Reload
+; NEON-NEXT:    ldr x12, [sp, #1520] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1568] // 8-byte Reload
+; NEON-NEXT:    eor x12, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #1312] // 8-byte Reload
+; NEON-NEXT:    mul x6, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1712] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x8
+; NEON-NEXT:    ldr x8, [sp, #1616] // 8-byte Reload
+; NEON-NEXT:    eor x13, x9, x10
+; NEON-NEXT:    ldr x9, [sp, #1448] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1536] // 8-byte Reload
+; NEON-NEXT:    eor x6, x7, x6
+; NEON-NEXT:    ldr x7, [sp, #1424] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1592] // 8-byte Reload
+; NEON-NEXT:    mul x7, x0, x7
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1512] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1320] // 8-byte Reload
+; NEON-NEXT:    mul x5, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #1624] // 8-byte Reload
+; NEON-NEXT:    eor x10, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1560] // 8-byte Reload
+; NEON-NEXT:    eor x11, x12, x8
+; NEON-NEXT:    ldr x8, [sp, #1504] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x8
+; NEON-NEXT:    ldr x8, [sp, #1480] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #1328] // 8-byte Reload
+; NEON-NEXT:    mul x4, x0, x9
+; NEON-NEXT:    ldr x9, [sp, #1648] // 8-byte Reload
+; NEON-NEXT:    eor x10, x10, x9
+; NEON-NEXT:    ldr x9, [sp, #1552] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x10
+; NEON-NEXT:    ldr x10, [sp, #1576] // 8-byte Reload
+; NEON-NEXT:    eor x11, x11, x9
+; NEON-NEXT:    ldr x9, [sp, #1472] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #1528] // 8-byte Reload
+; NEON-NEXT:    eor x9, x11, x9
+; NEON-NEXT:    ldr x11, [sp, #1456] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1488] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #1336] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x10
+; NEON-NEXT:    ldr x10, [sp, #1544] // 8-byte Reload
+; NEON-NEXT:    mul x3, x0, x11
+; NEON-NEXT:    ldr x11, [sp, #992] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1464] // 8-byte Reload
+; NEON-NEXT:    eor x9, x13, x9
+; NEON-NEXT:    eor x8, x8, x10
+; NEON-NEXT:    ldr x10, [sp, #1344] // 8-byte Reload
+; NEON-NEXT:    mul x1, x0, x10
+; NEON-NEXT:    ldr x10, [sp, #1056] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x10
+; NEON-NEXT:    ldr x10, [sp, #1496] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x10
+; NEON-NEXT:    ldr x10, [sp, #1080] // 8-byte Reload
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    ldr x9, [sp, #1352] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x10
+; NEON-NEXT:    eor x14, x21, x16
+; NEON-NEXT:    rbit x8, x8
+; NEON-NEXT:    mul x18, x0, x9
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    ldr x9, [sp, #1360] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x10, [sp, #1112] // 8-byte Reload
+; NEON-NEXT:    mul x17, x0, x9
+; NEON-NEXT:    eor x2, x13, x8, lsr #1
+; NEON-NEXT:    ldr x8, [sp, #1136] // 8-byte Reload
+; NEON-NEXT:    ldr x9, [sp, #1128] // 8-byte Reload
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    ldr x9, [sp, #1120] // 8-byte Reload
+; NEON-NEXT:    eor x13, x9, x10
+; NEON-NEXT:    ldr x9, [sp, #1096] // 8-byte Reload
+; NEON-NEXT:    ldr x10, [sp, #1072] // 8-byte Reload
+; NEON-NEXT:    eor x13, x8, x13
+; NEON-NEXT:    ldr x8, [sp, #1104] // 8-byte Reload
+; NEON-NEXT:    eor x14, x9, x10
+; NEON-NEXT:    ldr x9, [sp, #1368] // 8-byte Reload
+; NEON-NEXT:    ldr x10, [sp, #984] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x8
+; NEON-NEXT:    ldr x8, [sp, #1064] // 8-byte Reload
+; NEON-NEXT:    mul x21, x0, x9
+; NEON-NEXT:    ldr x9, [sp, #1048] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #1040] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    ldr x9, [sp, #1000] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1376] // 8-byte Reload
+; NEON-NEXT:    mul x16, x0, x10
+; NEON-NEXT:    ldr x10, [sp, #1088] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x10
+; NEON-NEXT:    ldr x10, [sp, #1024] // 8-byte Reload
+; NEON-NEXT:    eor x8, x13, x8
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1016] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1008] // 8-byte Reload
+; NEON-NEXT:    eor x14, x10, x11
+; NEON-NEXT:    ldr x10, [sp, #1384] // 8-byte Reload
+; NEON-NEXT:    ldr x11, [sp, #936] // 8-byte Reload
+; NEON-NEXT:    mul x15, x0, x10
+; NEON-NEXT:    ldr x10, [sp, #1032] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #976] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x10
+; NEON-NEXT:    eor x10, x8, x9
+; NEON-NEXT:    ldr x8, [sp, #968] // 8-byte Reload
+; NEON-NEXT:    eor x9, x13, x8
+; NEON-NEXT:    ldr x8, [sp, #1392] // 8-byte Reload
+; NEON-NEXT:    mul x13, x0, x8
+; NEON-NEXT:    ldr x8, [sp, #960] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x8
+; NEON-NEXT:    ldr x8, [sp, #944] // 8-byte Reload
+; NEON-NEXT:    eor x14, x8, x11
+; NEON-NEXT:    ldr x8, [sp, #1168] // 8-byte Reload
+; NEON-NEXT:    ldr x11, [sp, #1176] // 8-byte Reload
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #928] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x11
+; NEON-NEXT:    ldr x11, [sp, #1184] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #1400] // 8-byte Reload
+; NEON-NEXT:    mul x12, x0, x11
+; NEON-NEXT:    ldr x11, [sp, #952] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x11
+; NEON-NEXT:    ldr x11, [sp, #920] // 8-byte Reload
+; NEON-NEXT:    eor x9, x10, x9
+; NEON-NEXT:    ldr x10, [sp, #1152] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x11
+; NEON-NEXT:    ldr x11, [sp, #1192] // 8-byte Reload
+; NEON-NEXT:    eor x12, x13, x12
+; NEON-NEXT:    ldr x13, [sp, #1440] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #1144] // 8-byte Reload
+; NEON-NEXT:    mul x13, x0, x13
+; NEON-NEXT:    eor x14, x14, x11
+; NEON-NEXT:    ldr x11, [sp, #1200] // 8-byte Reload
+; NEON-NEXT:    eor x10, x14, x10
+; NEON-NEXT:    ldr x14, [sp, #1208] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x11
+; NEON-NEXT:    ldr x11, [sp, #1408] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x14
+; NEON-NEXT:    ldr x14, [sp, #1160] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x23
+; NEON-NEXT:    mul x11, x0, x11
+; NEON-NEXT:    eor x10, x10, x14
+; NEON-NEXT:    ldr x14, [sp, #1416] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x24
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    eor x10, x25, x26
+; NEON-NEXT:    eor x10, x10, x27
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    mul x14, x0, x14
+; NEON-NEXT:    eor x9, x10, x28
+; NEON-NEXT:    eor x10, x6, x5
+; NEON-NEXT:    eor x10, x10, x4
+; NEON-NEXT:    ldr x4, [sp, #1432] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x29
+; NEON-NEXT:    eor x10, x10, x3
+; NEON-NEXT:    eor x9, x9, x30
+; NEON-NEXT:    eor x11, x12, x11
+; NEON-NEXT:    mul x4, x0, x4
+; NEON-NEXT:    eor x10, x10, x1
+; NEON-NEXT:    eor x9, x9, x22
+; NEON-NEXT:    eor x10, x10, x18
+; NEON-NEXT:    eor x9, x9, x20
+; NEON-NEXT:    eor x11, x11, x14
+; NEON-NEXT:    eor x10, x10, x17
+; NEON-NEXT:    eor x9, x9, x19
+; NEON-NEXT:    eor x11, x11, x7
+; NEON-NEXT:    eor x10, x10, x21
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    mov x1, x2
+; NEON-NEXT:    eor x9, x10, x16
+; NEON-NEXT:    eor x10, x11, x4
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    eor x9, x10, x13
+; NEON-NEXT:    eor x0, x8, x9
+; NEON-NEXT:    add sp, sp, #1952
+; NEON-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v1i128_neon:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-AES-NEXT:    sub sp, sp, #1952
+; NEON-AES-NEXT:    .cfi_def_cfa_offset 2048
+; NEON-AES-NEXT:    .cfi_offset w19, -8
+; NEON-AES-NEXT:    .cfi_offset w20, -16
+; NEON-AES-NEXT:    .cfi_offset w21, -24
+; NEON-AES-NEXT:    .cfi_offset w22, -32
+; NEON-AES-NEXT:    .cfi_offset w23, -40
+; NEON-AES-NEXT:    .cfi_offset w24, -48
+; NEON-AES-NEXT:    .cfi_offset w25, -56
+; NEON-AES-NEXT:    .cfi_offset w26, -64
+; NEON-AES-NEXT:    .cfi_offset w27, -72
+; NEON-AES-NEXT:    .cfi_offset w28, -80
+; NEON-AES-NEXT:    .cfi_offset w30, -88
+; NEON-AES-NEXT:    .cfi_offset w29, -96
+; NEON-AES-NEXT:    rbit x8, x2
+; NEON-AES-NEXT:    rbit x9, x0
+; NEON-AES-NEXT:    and x7, x2, #0x2
+; NEON-AES-NEXT:    and x18, x2, #0x1
+; NEON-AES-NEXT:    and x4, x2, #0x4
+; NEON-AES-NEXT:    and x5, x2, #0x10
+; NEON-AES-NEXT:    and x10, x8, #0x2
+; NEON-AES-NEXT:    and x6, x2, #0x80
+; NEON-AES-NEXT:    and x17, x2, #0x800
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1944] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1936] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x4
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1928] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x8
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1920] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x10
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1912] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x20
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1904] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x40
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1888] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x80
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1896] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x100
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1880] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x200
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1864] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x400
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1872] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x800
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1856] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1832] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x2000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1840] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x4000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1824] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x8000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1848] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x10000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1816] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x20000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1792] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x40000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1784] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x80000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1808] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x100000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1776] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x200000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1800] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x400000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1768] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x800000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1728] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1760] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x2000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1720] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x4000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1752] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x8000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1736] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x10000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1744] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x20000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1696] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x40000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1656] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x80000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1688] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x100000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1672] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x200000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1680] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x400000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1664] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x800000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1704] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1712] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x2000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1640] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x4000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1608] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x8000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1584] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x10000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1632] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x20000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1600] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x40000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1616] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x80000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1592] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x100000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1624] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x200000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1648] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x400000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1568] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x800000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1520] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1536] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x2000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1512] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x4000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1560] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x8000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1504] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x10000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1552] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x20000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1528] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x40000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1576] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x80000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1544] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x100000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1480] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x200000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1448] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x400000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1472] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x800000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1456] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x1000000000000000
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x10, [sp, #1488] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x8, #0x2000000000000000
+; NEON-AES-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-AES-NEXT:    mul x8, x9, x8
+; NEON-AES-NEXT:    mul x10, x9, x10
+; NEON-AES-NEXT:    str x8, [sp, #1496] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x1, x7
+; NEON-AES-NEXT:    str x10, [sp, #1464] // 8-byte Spill
+; NEON-AES-NEXT:    str x8, [sp, #1032] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x1, x18
+; NEON-AES-NEXT:    mul x18, x0, x18
+; NEON-AES-NEXT:    str x8, [sp, #1008] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x1, x4
+; NEON-AES-NEXT:    str x18, [sp, #1128] // 8-byte Spill
+; NEON-AES-NEXT:    mul x18, x0, x4
+; NEON-AES-NEXT:    str x8, [sp, #992] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x2, #0x8
+; NEON-AES-NEXT:    mul x9, x1, x8
+; NEON-AES-NEXT:    str x18, [sp, #1120] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x8
+; NEON-AES-NEXT:    str x9, [sp, #976] // 8-byte Spill
+; NEON-AES-NEXT:    mul x9, x1, x5
+; NEON-AES-NEXT:    str x8, [sp, #1112] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x5
+; NEON-AES-NEXT:    str x9, [sp, #952] // 8-byte Spill
+; NEON-AES-NEXT:    and x9, x2, #0x20
+; NEON-AES-NEXT:    mul x10, x1, x9
+; NEON-AES-NEXT:    str x8, [sp, #1096] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1008] // 8-byte Reload
+; NEON-AES-NEXT:    str x10, [sp, #928] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x2, #0x40
+; NEON-AES-NEXT:    mul x11, x1, x10
+; NEON-AES-NEXT:    str x8, [sp, #1072] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #976] // 8-byte Reload
+; NEON-AES-NEXT:    str x11, [sp, #944] // 8-byte Spill
+; NEON-AES-NEXT:    mul x11, x1, x6
+; NEON-AES-NEXT:    str x8, [sp, #1104] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x6
+; NEON-AES-NEXT:    str x11, [sp, #936] // 8-byte Spill
+; NEON-AES-NEXT:    and x11, x2, #0x100
+; NEON-AES-NEXT:    mul x12, x1, x11
+; NEON-AES-NEXT:    str x8, [sp, #1064] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #928] // 8-byte Reload
+; NEON-AES-NEXT:    str x12, [sp, #920] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x2, #0x200
+; NEON-AES-NEXT:    mul x13, x1, x12
+; NEON-AES-NEXT:    str x8, [sp, #1048] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x12
+; NEON-AES-NEXT:    str x13, [sp, #968] // 8-byte Spill
+; NEON-AES-NEXT:    and x13, x2, #0x400
+; NEON-AES-NEXT:    mul x14, x1, x13
+; NEON-AES-NEXT:    str x8, [sp, #1040] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x13
+; NEON-AES-NEXT:    str x14, [sp, #960] // 8-byte Spill
+; NEON-AES-NEXT:    mul x14, x1, x17
+; NEON-AES-NEXT:    str x8, [sp, #1088] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x17
+; NEON-AES-NEXT:    str x14, [sp, #848] // 8-byte Spill
+; NEON-AES-NEXT:    and x14, x2, #0x1000
+; NEON-AES-NEXT:    mul x15, x1, x14
+; NEON-AES-NEXT:    str x8, [sp, #1000] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x14
+; NEON-AES-NEXT:    str x15, [sp, #824] // 8-byte Spill
+; NEON-AES-NEXT:    and x15, x2, #0x2000
+; NEON-AES-NEXT:    mul x16, x1, x15
+; NEON-AES-NEXT:    ldr x12, [sp, #824] // 8-byte Reload
+; NEON-AES-NEXT:    str x8, [sp, #984] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x15
+; NEON-AES-NEXT:    str x16, [sp, #896] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x2, #0x4000
+; NEON-AES-NEXT:    mul x19, x1, x16
+; NEON-AES-NEXT:    str x8, [sp, #1024] // 8-byte Spill
+; NEON-AES-NEXT:    mul x8, x0, x16
+; NEON-AES-NEXT:    str x19, [sp, #888] // 8-byte Spill
+; NEON-AES-NEXT:    and x19, x2, #0x8000
+; NEON-AES-NEXT:    mul x20, x1, x19
+; NEON-AES-NEXT:    str x8, [sp, #1016] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x8, [sp, #1032] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    ldr x9, [sp, #992] // 8-byte Reload
+; NEON-AES-NEXT:    str x20, [sp, #904] // 8-byte Spill
+; NEON-AES-NEXT:    and x20, x2, #0x10000
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    mul x21, x1, x20
+; NEON-AES-NEXT:    ldr x10, [sp, #952] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #944] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x11, x0, x19
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    ldr x10, [sp, #936] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #968] // 8-byte Reload
+; NEON-AES-NEXT:    str x21, [sp, #880] // 8-byte Spill
+; NEON-AES-NEXT:    and x21, x2, #0x20000
+; NEON-AES-NEXT:    mul x22, x1, x21
+; NEON-AES-NEXT:    str x11, [sp, #1032] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x11, [sp, #920] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #848] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    ldr x10, [sp, #896] // 8-byte Reload
+; NEON-AES-NEXT:    str x22, [sp, #840] // 8-byte Spill
+; NEON-AES-NEXT:    and x22, x2, #0x40000
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x23, x1, x22
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #960] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #888] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x0, x20
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #904] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x11, x0, x21
+; NEON-AES-NEXT:    str x23, [sp, #832] // 8-byte Spill
+; NEON-AES-NEXT:    and x23, x2, #0x80000
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    mul x24, x1, x23
+; NEON-AES-NEXT:    ldr x10, [sp, #880] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    str x12, [sp, #1008] // 8-byte Spill
+; NEON-AES-NEXT:    str x11, [sp, #992] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x11, [sp, #840] // 8-byte Reload
+; NEON-AES-NEXT:    str x24, [sp, #872] // 8-byte Spill
+; NEON-AES-NEXT:    and x24, x2, #0x100000
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x25, x1, x24
+; NEON-AES-NEXT:    ldr x11, [sp, #832] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x9, [sp, #872] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x25, [sp, #864] // 8-byte Spill
+; NEON-AES-NEXT:    and x25, x2, #0x200000
+; NEON-AES-NEXT:    mul x26, x1, x25
+; NEON-AES-NEXT:    str x26, [sp, #912] // 8-byte Spill
+; NEON-AES-NEXT:    and x26, x2, #0x400000
+; NEON-AES-NEXT:    mul x27, x1, x26
+; NEON-AES-NEXT:    str x27, [sp, #760] // 8-byte Spill
+; NEON-AES-NEXT:    and x27, x2, #0x800000
+; NEON-AES-NEXT:    mul x28, x1, x27
+; NEON-AES-NEXT:    ldr x11, [sp, #760] // 8-byte Reload
+; NEON-AES-NEXT:    str x28, [sp, #736] // 8-byte Spill
+; NEON-AES-NEXT:    and x28, x2, #0x1000000
+; NEON-AES-NEXT:    mul x29, x1, x28
+; NEON-AES-NEXT:    ldr x12, [sp, #736] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x22
+; NEON-AES-NEXT:    str x29, [sp, #784] // 8-byte Spill
+; NEON-AES-NEXT:    and x29, x2, #0x2000000
+; NEON-AES-NEXT:    mul x30, x1, x29
+; NEON-AES-NEXT:    ldr x10, [sp, #784] // 8-byte Reload
+; NEON-AES-NEXT:    str x12, [sp, #976] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #864] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    str x30, [sp, #776] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    ldr x11, [sp, #776] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x11, x0, x23
+; NEON-AES-NEXT:    str x30, [sp, #800] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000
+; NEON-AES-NEXT:    str x30, [sp, #1144] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x11, [sp, #968] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x11, [sp, #912] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #800] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    str x30, [sp, #792] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000
+; NEON-AES-NEXT:    str x30, [sp, #1152] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #816] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000
+; NEON-AES-NEXT:    str x30, [sp, #1160] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000
+; NEON-AES-NEXT:    ldr x9, [sp, #816] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #728] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000
+; NEON-AES-NEXT:    str x30, [sp, #1168] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000
+; NEON-AES-NEXT:    ldr x11, [sp, #728] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #696] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000
+; NEON-AES-NEXT:    str x30, [sp, #1176] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000
+; NEON-AES-NEXT:    ldr x12, [sp, #696] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #792] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x30, [sp, #688] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    str x30, [sp, #1184] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000
+; NEON-AES-NEXT:    ldr x12, [sp, #688] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x24
+; NEON-AES-NEXT:    str x30, [sp, #744] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000
+; NEON-AES-NEXT:    str x30, [sp, #1192] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000
+; NEON-AES-NEXT:    ldr x10, [sp, #744] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #960] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    str x30, [sp, #720] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000
+; NEON-AES-NEXT:    str x30, [sp, #1200] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000
+; NEON-AES-NEXT:    ldr x11, [sp, #720] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    str x30, [sp, #768] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000
+; NEON-AES-NEXT:    str x30, [sp, #1208] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000
+; NEON-AES-NEXT:    ldr x9, [sp, #768] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x30, [sp, #808] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000
+; NEON-AES-NEXT:    str x30, [sp, #1216] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #856] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000
+; NEON-AES-NEXT:    str x30, [sp, #1224] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #648] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000
+; NEON-AES-NEXT:    str x30, [sp, #1232] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #648] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #632] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000
+; NEON-AES-NEXT:    str x30, [sp, #1240] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000000
+; NEON-AES-NEXT:    ldr x12, [sp, #632] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x25
+; NEON-AES-NEXT:    str x30, [sp, #664] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000000
+; NEON-AES-NEXT:    str x30, [sp, #1248] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000000
+; NEON-AES-NEXT:    ldr x10, [sp, #664] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #952] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    str x30, [sp, #640] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000000
+; NEON-AES-NEXT:    str x30, [sp, #1256] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #640] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    str x30, [sp, #680] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000000
+; NEON-AES-NEXT:    str x30, [sp, #1264] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #672] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000000
+; NEON-AES-NEXT:    str x30, [sp, #1272] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #712] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000000
+; NEON-AES-NEXT:    str x30, [sp, #1280] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #704] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000000
+; NEON-AES-NEXT:    str x30, [sp, #1288] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #752] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000000
+; NEON-AES-NEXT:    str x30, [sp, #1296] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #520] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000000
+; NEON-AES-NEXT:    str x30, [sp, #1304] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #520] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #504] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000000
+; NEON-AES-NEXT:    str x30, [sp, #1312] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000000
+; NEON-AES-NEXT:    ldr x12, [sp, #504] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x26
+; NEON-AES-NEXT:    str x30, [sp, #560] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1320] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #944] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x12, [sp, #808] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #680] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #560] // 8-byte Reload
+; NEON-AES-NEXT:    str x30, [sp, #552] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1328] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000000
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    ldr x12, [sp, #672] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #552] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x27
+; NEON-AES-NEXT:    str x30, [sp, #584] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1336] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #936] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x12, [sp, #856] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #712] // 8-byte Reload
+; NEON-AES-NEXT:    str x30, [sp, #576] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x8000000000000
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    str x30, [sp, #1344] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000000000
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    ldr x12, [sp, #584] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x9, [sp, #752] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #704] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #576] // 8-byte Reload
+; NEON-AES-NEXT:    str x30, [sp, #608] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x10000000000000
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x30, [sp, #1352] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000000000
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    ldr x10, [sp, #608] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    mul x12, x0, x28
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    str x30, [sp, #592] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x20000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1360] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #592] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #928] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    str x30, [sp, #624] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x40000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1368] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000000000
+; NEON-AES-NEXT:    ldr x9, [sp, #624] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x30, [sp, #616] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x80000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1376] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #528] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x100000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1384] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #528] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #512] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x200000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1392] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000000000
+; NEON-AES-NEXT:    ldr x12, [sp, #512] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    mul x12, x0, x29
+; NEON-AES-NEXT:    str x30, [sp, #544] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x400000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1400] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000000000
+; NEON-AES-NEXT:    ldr x10, [sp, #544] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x12, [sp, #920] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #616] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    eor x21, x8, x9
+; NEON-AES-NEXT:    str x30, [sp, #536] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x800000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1408] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000000000
+; NEON-AES-NEXT:    ldr x11, [sp, #536] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1144] // 8-byte Reload
+; NEON-AES-NEXT:    mul x11, x0, x11
+; NEON-AES-NEXT:    str x30, [sp, #568] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x1000000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1416] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000000000
+; NEON-AES-NEXT:    ldr x8, [sp, #568] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    eor x8, x10, x8
+; NEON-AES-NEXT:    str x11, [sp, #1144] // 8-byte Spill
+; NEON-AES-NEXT:    str x30, [sp, #600] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x2000000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1424] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000000000
+; NEON-AES-NEXT:    mul x30, x1, x30
+; NEON-AES-NEXT:    str x30, [sp, #656] // 8-byte Spill
+; NEON-AES-NEXT:    and x30, x2, #0x4000000000000000
+; NEON-AES-NEXT:    and x2, x2, #0x8000000000000000
+; NEON-AES-NEXT:    str x30, [sp, #1432] // 8-byte Spill
+; NEON-AES-NEXT:    mul x30, x1, x2
+; NEON-AES-NEXT:    and x1, x3, #0x2
+; NEON-AES-NEXT:    str x2, [sp, #1440] // 8-byte Spill
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x1
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    stp x1, x2, [sp, #488] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x4
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x8
+; NEON-AES-NEXT:    ldp x10, x9, [sp, #488] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    str x1, [sp, #456] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x10
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #456] // 8-byte Reload
+; NEON-AES-NEXT:    stp x2, x1, [sp, #472] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x20
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x10, [sp, #472] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #480] // 8-byte Reload
+; NEON-AES-NEXT:    mul x11, x0, x11
+; NEON-AES-NEXT:    str x1, [sp, #464] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x40
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x80
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x11, [sp, #1152] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x11, [sp, #600] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #464] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #424] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x100
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x1, [sp, #384] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x200
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #384] // 8-byte Reload
+; NEON-AES-NEXT:    stp x1, x2, [sp, #440] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x400
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x800
+; NEON-AES-NEXT:    ldr x11, [sp, #448] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #424] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #440] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    mul x12, x0, x12
+; NEON-AES-NEXT:    str x1, [sp, #392] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x1000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x12, [sp, #1160] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x12, [sp, #656] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #376] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x2000
+; NEON-AES-NEXT:    eor x8, x8, x12
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #376] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x8, x30
+; NEON-AES-NEXT:    stp x1, x2, [sp, #408] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x4000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #416] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #392] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #408] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1168] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #400] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x8000
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    mul x12, x0, x12
+; NEON-AES-NEXT:    str x1, [sp, #432] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x10000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x20000
+; NEON-AES-NEXT:    str x12, [sp, #1168] // 8-byte Spill
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x1, [sp, #328] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x40000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #328] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #320] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x80000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    stp x1, x2, [sp, #352] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x100000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #360] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #400] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #320] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #344] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x200000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1176] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x0, x12
+; NEON-AES-NEXT:    str x1, [sp, #368] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x400000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x800000
+; NEON-AES-NEXT:    str x12, [sp, #1176] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x12, [sp, #432] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #352] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #368] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    str x1, [sp, #232] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x1000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x13, [sp, #232] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #224] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x2000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    stp x2, x1, [sp, #280] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x4000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #280] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #344] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #224] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #272] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x8000000
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x10000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #1184] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x11, [sp, #288] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x11, x12, x11
+; NEON-AES-NEXT:    ldr x12, [sp, #272] // 8-byte Reload
+; NEON-AES-NEXT:    mul x13, x0, x13
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    stp x1, x2, [sp, #304] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x20000000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x40000000
+; NEON-AES-NEXT:    ldr x10, [sp, #312] // 8-byte Reload
+; NEON-AES-NEXT:    str x13, [sp, #1184] // 8-byte Spill
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    stp x1, x2, [sp, #168] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x80000000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x100000000
+; NEON-AES-NEXT:    ldp x13, x12, [sp, #168] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #1192] // 8-byte Reload
+; NEON-AES-NEXT:    mul x13, x0, x13
+; NEON-AES-NEXT:    stp x1, x2, [sp, #200] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x200000000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x400000000
+; NEON-AES-NEXT:    ldr x11, [sp, #208] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x11, x12, x11
+; NEON-AES-NEXT:    ldr x12, [sp, #304] // 8-byte Reload
+; NEON-AES-NEXT:    str x13, [sp, #1192] // 8-byte Spill
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #200] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x9, x10
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1200] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #216] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x800000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x10, [sp, #216] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x0, x12
+; NEON-AES-NEXT:    stp x2, x1, [sp, #256] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x1000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x9, [sp, #256] // 8-byte Reload
+; NEON-AES-NEXT:    str x12, [sp, #1200] // 8-byte Spill
+; NEON-AES-NEXT:    eor x9, x11, x9
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    str x1, [sp, #336] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x2000000000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x4000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x1, [sp, #136] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x8000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #136] // 8-byte Reload
+; NEON-AES-NEXT:    stp x1, x2, [sp, #152] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x10000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x10, [sp, #160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1208] // 8-byte Reload
+; NEON-AES-NEXT:    mul x11, x0, x11
+; NEON-AES-NEXT:    str x1, [sp, #144] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x20000000000
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x40000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x11, [sp, #1208] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x11, [sp, #264] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #152] // 8-byte Reload
+; NEON-AES-NEXT:    stp x1, x2, [sp, #184] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x80000000000
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    mul x2, x0, x1
+; NEON-AES-NEXT:    and x1, x3, #0x100000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    stp x1, x2, [sp, #240] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x1, x3, #0x200000000000
+; NEON-AES-NEXT:    and x2, x3, #0x1000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    mul x2, x0, x2
+; NEON-AES-NEXT:    str x1, [sp, #296] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x400000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x1, [sp, #40] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x800000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #40] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #16] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x2000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #16] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #144] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x2
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1216] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #56] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x4000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    mul x23, x0, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #336] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #192] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #48] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x8000000000000
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #56] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x9, [sp, #248] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #184] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #48] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #96] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x10000000000000
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x10, [sp, #96] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1224] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #240] // 8-byte Reload
+; NEON-AES-NEXT:    mul x24, x0, x12
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    str x1, [sp, #88] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x20000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #88] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1232] // 8-byte Reload
+; NEON-AES-NEXT:    mul x25, x0, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #296] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #112] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x40000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #112] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    str x1, [sp, #104] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x80000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    str x1, [sp, #120] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x100000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x9, [sp, #120] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #80] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x200000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x11, [sp, #80] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #32] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x400000000000000
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #32] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #104] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x12
+; NEON-AES-NEXT:    str x1, [sp, #24] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x800000000000000
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    ldr x12, [sp, #24] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1248] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1240] // 8-byte Reload
+; NEON-AES-NEXT:    mul x27, x0, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1944] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #72] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x1000000000000000
+; NEON-AES-NEXT:    mul x26, x0, x12
+; NEON-AES-NEXT:    ldr x10, [sp, #72] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x12, [sp, #1904] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #1920] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #64] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x2000000000000000
+; NEON-AES-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x8, x10, x8
+; NEON-AES-NEXT:    ldr x10, [sp, #1936] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    ldr x10, [sp, #1928] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #128] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x4000000000000000
+; NEON-AES-NEXT:    eor x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1912] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1888] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1792] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x10
+; NEON-AES-NEXT:    ldr x11, [sp, #1256] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1880] // 8-byte Reload
+; NEON-AES-NEXT:    mul x28, x0, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #128] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #1056] // 8-byte Spill
+; NEON-AES-NEXT:    and x1, x3, #0x8000000000000000
+; NEON-AES-NEXT:    eor x14, x8, x11
+; NEON-AES-NEXT:    ldr x8, [sp, #1896] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x11, [sp, #1832] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x1
+; NEON-AES-NEXT:    eor x10, x8, x10
+; NEON-AES-NEXT:    ldr x8, [sp, #1864] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1856] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x8, x11
+; NEON-AES-NEXT:    ldr x8, [sp, #1264] // 8-byte Reload
+; NEON-AES-NEXT:    str x1, [sp, #1080] // 8-byte Spill
+; NEON-AES-NEXT:    mul x1, x0, x7
+; NEON-AES-NEXT:    mul x29, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1872] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1840] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    str x1, [sp, #1136] // 8-byte Spill
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1816] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #1824] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1784] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1272] // 8-byte Reload
+; NEON-AES-NEXT:    mul x30, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1848] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1808] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    eor x11, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1776] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x12, [sp, #1728] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1768] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #1280] // 8-byte Reload
+; NEON-AES-NEXT:    mul x22, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1800] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1760] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    eor x11, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1720] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x12, [sp, #1656] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1696] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #1288] // 8-byte Reload
+; NEON-AES-NEXT:    mul x20, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1752] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1688] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1736] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x12, [sp, #1608] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1672] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1296] // 8-byte Reload
+; NEON-AES-NEXT:    mul x19, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1744] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1680] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1640] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #1664] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1584] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1304] // 8-byte Reload
+; NEON-AES-NEXT:    mul x7, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1704] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1632] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1600] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x12, [sp, #1520] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1568] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #1312] // 8-byte Reload
+; NEON-AES-NEXT:    mul x6, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1712] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1616] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x9, x10
+; NEON-AES-NEXT:    ldr x9, [sp, #1448] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1536] // 8-byte Reload
+; NEON-AES-NEXT:    eor x6, x7, x6
+; NEON-AES-NEXT:    ldr x7, [sp, #1424] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1592] // 8-byte Reload
+; NEON-AES-NEXT:    mul x7, x0, x7
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1512] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1320] // 8-byte Reload
+; NEON-AES-NEXT:    mul x5, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1624] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1560] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x12, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1504] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1480] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1328] // 8-byte Reload
+; NEON-AES-NEXT:    mul x4, x0, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1648] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x10, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1552] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1576] // 8-byte Reload
+; NEON-AES-NEXT:    eor x11, x11, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1472] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1528] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x11, x9
+; NEON-AES-NEXT:    ldr x11, [sp, #1456] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1488] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1336] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1544] // 8-byte Reload
+; NEON-AES-NEXT:    mul x3, x0, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #992] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1464] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x13, x9
+; NEON-AES-NEXT:    eor x8, x8, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1344] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x0, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1056] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1496] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1080] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    ldr x9, [sp, #1352] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x10
+; NEON-AES-NEXT:    eor x14, x21, x16
+; NEON-AES-NEXT:    rbit x8, x8
+; NEON-AES-NEXT:    mul x18, x0, x9
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    ldr x9, [sp, #1360] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x10, [sp, #1112] // 8-byte Reload
+; NEON-AES-NEXT:    mul x17, x0, x9
+; NEON-AES-NEXT:    eor x2, x13, x8, lsr #1
+; NEON-AES-NEXT:    ldr x8, [sp, #1136] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x9, [sp, #1128] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    ldr x9, [sp, #1120] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x9, x10
+; NEON-AES-NEXT:    ldr x9, [sp, #1096] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x10, [sp, #1072] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x8, x13
+; NEON-AES-NEXT:    ldr x8, [sp, #1104] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x9, x10
+; NEON-AES-NEXT:    ldr x9, [sp, #1368] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x10, [sp, #984] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1064] // 8-byte Reload
+; NEON-AES-NEXT:    mul x21, x0, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1048] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1040] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    ldr x9, [sp, #1000] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1376] // 8-byte Reload
+; NEON-AES-NEXT:    mul x16, x0, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1088] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1024] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x13, x8
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1016] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1008] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x10, x11
+; NEON-AES-NEXT:    ldr x10, [sp, #1384] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x11, [sp, #936] // 8-byte Reload
+; NEON-AES-NEXT:    mul x15, x0, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1032] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #976] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x10
+; NEON-AES-NEXT:    eor x10, x8, x9
+; NEON-AES-NEXT:    ldr x8, [sp, #968] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x13, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #1392] // 8-byte Reload
+; NEON-AES-NEXT:    mul x13, x0, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #960] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #944] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x8, x11
+; NEON-AES-NEXT:    ldr x8, [sp, #1168] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x11, [sp, #1176] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #928] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1184] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1400] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x0, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #952] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #920] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x10, x9
+; NEON-AES-NEXT:    ldr x10, [sp, #1152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1192] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x13, x12
+; NEON-AES-NEXT:    ldr x13, [sp, #1440] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1144] // 8-byte Reload
+; NEON-AES-NEXT:    mul x13, x0, x13
+; NEON-AES-NEXT:    eor x14, x14, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1200] // 8-byte Reload
+; NEON-AES-NEXT:    eor x10, x14, x10
+; NEON-AES-NEXT:    ldr x14, [sp, #1208] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1408] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x23
+; NEON-AES-NEXT:    mul x11, x0, x11
+; NEON-AES-NEXT:    eor x10, x10, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1416] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x24
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    eor x10, x25, x26
+; NEON-AES-NEXT:    eor x10, x10, x27
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    mul x14, x0, x14
+; NEON-AES-NEXT:    eor x9, x10, x28
+; NEON-AES-NEXT:    eor x10, x6, x5
+; NEON-AES-NEXT:    eor x10, x10, x4
+; NEON-AES-NEXT:    ldr x4, [sp, #1432] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x29
+; NEON-AES-NEXT:    eor x10, x10, x3
+; NEON-AES-NEXT:    eor x9, x9, x30
+; NEON-AES-NEXT:    eor x11, x12, x11
+; NEON-AES-NEXT:    mul x4, x0, x4
+; NEON-AES-NEXT:    eor x10, x10, x1
+; NEON-AES-NEXT:    eor x9, x9, x22
+; NEON-AES-NEXT:    eor x10, x10, x18
+; NEON-AES-NEXT:    eor x9, x9, x20
+; NEON-AES-NEXT:    eor x11, x11, x14
+; NEON-AES-NEXT:    eor x10, x10, x17
+; NEON-AES-NEXT:    eor x9, x9, x19
+; NEON-AES-NEXT:    eor x11, x11, x7
+; NEON-AES-NEXT:    eor x10, x10, x21
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    mov x1, x2
+; NEON-AES-NEXT:    eor x9, x10, x16
+; NEON-AES-NEXT:    eor x10, x11, x4
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    eor x9, x10, x13
+; NEON-AES-NEXT:    eor x0, x8, x9
+; NEON-AES-NEXT:    add sp, sp, #1952
+; NEON-AES-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-AES-NEXT:    ret
+  %a = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %x, <1 x i128> %y)
+  ret <1 x i128> %a
+}
 
-; TODO: Fix: lower with PMULL and PMULL2 .d variants which require FEAT_PMULL
-; define <1 x i64> @clmul_v1i64_neon_aes(<1 x i64> %x, <1 x i64> %y) #0 {
-;   %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %x, <1 x i64> %y)
-;   ret <1 x i64> %a
-; }
+define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) { ; Test: llvm.clmul.v8i16 with both operands zero-extended from <8 x i8>.
+; NEON-LABEL: clmul_v8i16_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.8h, #2
+; NEON-NEXT:    movi v3.8h, #1
+; NEON-NEXT:    movi v4.8h, #4
+; NEON-NEXT:    movi v5.8h, #8
+; NEON-NEXT:    movi v6.8h, #16
+; NEON-NEXT:    movi v7.8h, #32
+; NEON-NEXT:    ushll v1.8h, v1.8b, #0
+; NEON-NEXT:    movi v16.8h, #64
+; NEON-NEXT:    movi v17.8h, #128
+; NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    and v1.16b, v1.16b, v17.16b
+; NEON-NEXT:    xtn v2.8b, v2.8h
+; NEON-NEXT:    xtn v3.8b, v3.8h
+; NEON-NEXT:    xtn v4.8b, v4.8h
+; NEON-NEXT:    xtn v5.8b, v5.8h
+; NEON-NEXT:    xtn v6.8b, v6.8h
+; NEON-NEXT:    xtn v7.8b, v7.8h
+; NEON-NEXT:    xtn v16.8b, v16.8h
+; NEON-NEXT:    xtn v1.8b, v1.8h
+; NEON-NEXT:    umull v2.8h, v0.8b, v2.8b
+; NEON-NEXT:    umull v3.8h, v0.8b, v3.8b
+; NEON-NEXT:    umull v4.8h, v0.8b, v4.8b
+; NEON-NEXT:    umull v5.8h, v0.8b, v5.8b
+; NEON-NEXT:    umull v6.8h, v0.8b, v6.8b
+; NEON-NEXT:    umull v7.8h, v0.8b, v7.8b
+; NEON-NEXT:    umull v16.8h, v0.8b, v16.8b
+; NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-NEXT:    eor v4.16b, v6.16b, v7.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v16.16b
+; NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v8i16_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.8h, #2
+; NEON-AES-NEXT:    movi v3.8h, #1
+; NEON-AES-NEXT:    movi v4.8h, #4
+; NEON-AES-NEXT:    movi v5.8h, #8
+; NEON-AES-NEXT:    movi v6.8h, #16
+; NEON-AES-NEXT:    movi v7.8h, #32
+; NEON-AES-NEXT:    ushll v1.8h, v1.8b, #0
+; NEON-AES-NEXT:    movi v16.8h, #64
+; NEON-AES-NEXT:    movi v17.8h, #128
+; NEON-AES-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    xtn v2.8b, v2.8h
+; NEON-AES-NEXT:    xtn v3.8b, v3.8h
+; NEON-AES-NEXT:    xtn v4.8b, v4.8h
+; NEON-AES-NEXT:    xtn v5.8b, v5.8h
+; NEON-AES-NEXT:    xtn v6.8b, v6.8h
+; NEON-AES-NEXT:    xtn v7.8b, v7.8h
+; NEON-AES-NEXT:    xtn v16.8b, v16.8h
+; NEON-AES-NEXT:    xtn v1.8b, v1.8h
+; NEON-AES-NEXT:    umull v2.8h, v0.8b, v2.8b
+; NEON-AES-NEXT:    umull v3.8h, v0.8b, v3.8b
+; NEON-AES-NEXT:    umull v4.8h, v0.8b, v4.8b
+; NEON-AES-NEXT:    umull v5.8h, v0.8b, v5.8b
+; NEON-AES-NEXT:    umull v6.8h, v0.8b, v6.8b
+; NEON-AES-NEXT:    umull v7.8h, v0.8b, v7.8b
+; NEON-AES-NEXT:    umull v16.8h, v0.8b, v16.8b
+; NEON-AES-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    eor v4.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v16.16b
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    ret
+  %zextx = zext <8 x i8> %x to <8 x i16> ; widen each lane of %x from i8 to i16; the added high bits are zero
+  %zexty = zext <8 x i8> %y to <8 x i16> ; same widening for the second operand
+  %a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %zextx, <8 x i16> %zexty) ; carry-less multiply of the widened vectors
+  ret <8 x i16> %a
+}
+
+define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) { ; Test: llvm.clmul.v16i16 with both operands zero-extended from <16 x i8>; the expected code writes both v0 and v1 before returning.
+; NEON-LABEL: clmul_v16i16_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v4.8h, #2
+; NEON-NEXT:    ushll v2.8h, v1.8b, #0
+; NEON-NEXT:    movi v5.8h, #1
+; NEON-NEXT:    movi v6.8h, #4
+; NEON-NEXT:    movi v7.8h, #8
+; NEON-NEXT:    movi v17.8h, #16
+; NEON-NEXT:    ushll2 v3.8h, v1.16b, #0
+; NEON-NEXT:    movi v18.8h, #32
+; NEON-NEXT:    movi v1.8h, #128
+; NEON-NEXT:    movi v19.8h, #64
+; NEON-NEXT:    movi v25.2d, #0000000000000000
+; NEON-NEXT:    and v16.16b, v2.16b, v4.16b
+; NEON-NEXT:    and v20.16b, v2.16b, v5.16b
+; NEON-NEXT:    and v21.16b, v2.16b, v6.16b
+; NEON-NEXT:    and v22.16b, v2.16b, v7.16b
+; NEON-NEXT:    and v4.16b, v3.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v3.16b, v5.16b
+; NEON-NEXT:    and v6.16b, v3.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v3.16b, v7.16b
+; NEON-NEXT:    and v23.16b, v3.16b, v17.16b
+; NEON-NEXT:    and v24.16b, v3.16b, v18.16b
+; NEON-NEXT:    and v26.16b, v3.16b, v1.16b
+; NEON-NEXT:    and v17.16b, v2.16b, v17.16b
+; NEON-NEXT:    and v18.16b, v2.16b, v18.16b
+; NEON-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
+; NEON-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
+; NEON-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
+; NEON-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
+; NEON-NEXT:    uzp1 v23.16b, v0.16b, v23.16b
+; NEON-NEXT:    uzp1 v24.16b, v0.16b, v24.16b
+; NEON-NEXT:    and v3.16b, v3.16b, v19.16b
+; NEON-NEXT:    uzp1 v26.16b, v0.16b, v26.16b
+; NEON-NEXT:    uzp1 v25.16b, v0.16b, v25.16b
+; NEON-NEXT:    xtn v16.8b, v16.8h
+; NEON-NEXT:    xtn v20.8b, v20.8h
+; NEON-NEXT:    xtn v21.8b, v21.8h
+; NEON-NEXT:    xtn v22.8b, v22.8h
+; NEON-NEXT:    xtn v17.8b, v17.8h
+; NEON-NEXT:    xtn v18.8b, v18.8h
+; NEON-NEXT:    and v19.16b, v2.16b, v19.16b
+; NEON-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; NEON-NEXT:    umull2 v4.8h, v0.16b, v4.16b
+; NEON-NEXT:    umull2 v5.8h, v0.16b, v5.16b
+; NEON-NEXT:    umull2 v6.8h, v0.16b, v6.16b
+; NEON-NEXT:    umull2 v7.8h, v0.16b, v7.16b
+; NEON-NEXT:    umull2 v23.8h, v0.16b, v23.16b
+; NEON-NEXT:    umull2 v24.8h, v0.16b, v24.16b
+; NEON-NEXT:    umull2 v26.8h, v0.16b, v26.16b
+; NEON-NEXT:    umull2 v25.8h, v0.16b, v25.16b
+; NEON-NEXT:    xtn v19.8b, v19.8h
+; NEON-NEXT:    umull v16.8h, v0.8b, v16.8b
+; NEON-NEXT:    umull v20.8h, v0.8b, v20.8b
+; NEON-NEXT:    umull v21.8h, v0.8b, v21.8b
+; NEON-NEXT:    umull v22.8h, v0.8b, v22.8b
+; NEON-NEXT:    umull v17.8h, v0.8b, v17.8b
+; NEON-NEXT:    umull v18.8h, v0.8b, v18.8b
+; NEON-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
+; NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
+; NEON-NEXT:    eor v6.16b, v23.16b, v24.16b
+; NEON-NEXT:    eor v7.16b, v26.16b, v25.16b
+; NEON-NEXT:    eor v23.16b, v25.16b, v25.16b
+; NEON-NEXT:    and v1.16b, v2.16b, v1.16b
+; NEON-NEXT:    umull v2.8h, v0.8b, v19.8b
+; NEON-NEXT:    eor v16.16b, v20.16b, v16.16b
+; NEON-NEXT:    eor v19.16b, v21.16b, v22.16b
+; NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-NEXT:    eor v3.16b, v6.16b, v3.16b
+; NEON-NEXT:    eor v5.16b, v7.16b, v25.16b
+; NEON-NEXT:    eor v6.16b, v23.16b, v25.16b
+; NEON-NEXT:    xtn v1.8b, v1.8h
+; NEON-NEXT:    eor v7.16b, v16.16b, v19.16b
+; NEON-NEXT:    eor v2.16b, v17.16b, v2.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v5.16b, v25.16b
+; NEON-NEXT:    eor v5.16b, v6.16b, v25.16b
+; NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NEON-NEXT:    eor v1.16b, v7.16b, v2.16b
+; NEON-NEXT:    eor v2.16b, v3.16b, v4.16b
+; NEON-NEXT:    eor v3.16b, v5.16b, v25.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v16i16_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v4.8h, #2
+; NEON-AES-NEXT:    ushll v2.8h, v1.8b, #0
+; NEON-AES-NEXT:    movi v5.8h, #1
+; NEON-AES-NEXT:    movi v6.8h, #4
+; NEON-AES-NEXT:    movi v7.8h, #8
+; NEON-AES-NEXT:    movi v17.8h, #16
+; NEON-AES-NEXT:    ushll2 v3.8h, v1.16b, #0
+; NEON-AES-NEXT:    movi v18.8h, #32
+; NEON-AES-NEXT:    movi v1.8h, #128
+; NEON-AES-NEXT:    movi v19.8h, #64
+; NEON-AES-NEXT:    movi v25.2d, #0000000000000000
+; NEON-AES-NEXT:    and v16.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    and v20.16b, v2.16b, v5.16b
+; NEON-AES-NEXT:    and v21.16b, v2.16b, v6.16b
+; NEON-AES-NEXT:    and v22.16b, v2.16b, v7.16b
+; NEON-AES-NEXT:    and v4.16b, v3.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v3.16b, v5.16b
+; NEON-AES-NEXT:    and v6.16b, v3.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v3.16b, v7.16b
+; NEON-AES-NEXT:    and v23.16b, v3.16b, v17.16b
+; NEON-AES-NEXT:    and v24.16b, v3.16b, v18.16b
+; NEON-AES-NEXT:    and v26.16b, v3.16b, v1.16b
+; NEON-AES-NEXT:    and v17.16b, v2.16b, v17.16b
+; NEON-AES-NEXT:    and v18.16b, v2.16b, v18.16b
+; NEON-AES-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
+; NEON-AES-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
+; NEON-AES-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
+; NEON-AES-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
+; NEON-AES-NEXT:    uzp1 v23.16b, v0.16b, v23.16b
+; NEON-AES-NEXT:    uzp1 v24.16b, v0.16b, v24.16b
+; NEON-AES-NEXT:    and v3.16b, v3.16b, v19.16b
+; NEON-AES-NEXT:    uzp1 v26.16b, v0.16b, v26.16b
+; NEON-AES-NEXT:    uzp1 v25.16b, v0.16b, v25.16b
+; NEON-AES-NEXT:    xtn v16.8b, v16.8h
+; NEON-AES-NEXT:    xtn v20.8b, v20.8h
+; NEON-AES-NEXT:    xtn v21.8b, v21.8h
+; NEON-AES-NEXT:    xtn v22.8b, v22.8h
+; NEON-AES-NEXT:    xtn v17.8b, v17.8h
+; NEON-AES-NEXT:    xtn v18.8b, v18.8h
+; NEON-AES-NEXT:    and v19.16b, v2.16b, v19.16b
+; NEON-AES-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; NEON-AES-NEXT:    umull2 v4.8h, v0.16b, v4.16b
+; NEON-AES-NEXT:    umull2 v5.8h, v0.16b, v5.16b
+; NEON-AES-NEXT:    umull2 v6.8h, v0.16b, v6.16b
+; NEON-AES-NEXT:    umull2 v7.8h, v0.16b, v7.16b
+; NEON-AES-NEXT:    umull2 v23.8h, v0.16b, v23.16b
+; NEON-AES-NEXT:    umull2 v24.8h, v0.16b, v24.16b
+; NEON-AES-NEXT:    umull2 v26.8h, v0.16b, v26.16b
+; NEON-AES-NEXT:    umull2 v25.8h, v0.16b, v25.16b
+; NEON-AES-NEXT:    xtn v19.8b, v19.8h
+; NEON-AES-NEXT:    umull v16.8h, v0.8b, v16.8b
+; NEON-AES-NEXT:    umull v20.8h, v0.8b, v20.8b
+; NEON-AES-NEXT:    umull v21.8h, v0.8b, v21.8b
+; NEON-AES-NEXT:    umull v22.8h, v0.8b, v22.8b
+; NEON-AES-NEXT:    umull v17.8h, v0.8b, v17.8b
+; NEON-AES-NEXT:    umull v18.8h, v0.8b, v18.8b
+; NEON-AES-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v5.16b, v4.16b
+; NEON-AES-NEXT:    eor v5.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    eor v6.16b, v23.16b, v24.16b
+; NEON-AES-NEXT:    eor v7.16b, v26.16b, v25.16b
+; NEON-AES-NEXT:    eor v23.16b, v25.16b, v25.16b
+; NEON-AES-NEXT:    and v1.16b, v2.16b, v1.16b
+; NEON-AES-NEXT:    umull v2.8h, v0.8b, v19.8b
+; NEON-AES-NEXT:    eor v16.16b, v20.16b, v16.16b
+; NEON-AES-NEXT:    eor v19.16b, v21.16b, v22.16b
+; NEON-AES-NEXT:    eor v17.16b, v17.16b, v18.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    eor v3.16b, v6.16b, v3.16b
+; NEON-AES-NEXT:    eor v5.16b, v7.16b, v25.16b
+; NEON-AES-NEXT:    eor v6.16b, v23.16b, v25.16b
+; NEON-AES-NEXT:    xtn v1.8b, v1.8h
+; NEON-AES-NEXT:    eor v7.16b, v16.16b, v19.16b
+; NEON-AES-NEXT:    eor v2.16b, v17.16b, v2.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v5.16b, v25.16b
+; NEON-AES-NEXT:    eor v5.16b, v6.16b, v25.16b
+; NEON-AES-NEXT:    umull v0.8h, v0.8b, v1.8b
+; NEON-AES-NEXT:    eor v1.16b, v7.16b, v2.16b
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v4.16b
+; NEON-AES-NEXT:    eor v3.16b, v5.16b, v25.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    ret
+  %zextx = zext <16 x i8> %x to <16 x i16> ; widen each lane of %x from i8 to i16; the added high bits are zero
+  %zexty = zext <16 x i8> %y to <16 x i16> ; same widening for the second operand
+  %a = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %zextx, <16 x i16> %zexty) ; carry-less multiply of the widened vectors
+  ret <16 x i16> %a
+}
+
+define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) { ; Test: llvm.clmul.v4i32 with both operands zero-extended from <4 x i16>.
+; NEON-LABEL: clmul_v4i32_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v2.4s, #2
+; NEON-NEXT:    movi v3.4s, #1
+; NEON-NEXT:    movi v4.4s, #4
+; NEON-NEXT:    movi v5.4s, #8
+; NEON-NEXT:    ushll v1.4s, v1.4h, #0
+; NEON-NEXT:    movi v6.4s, #16
+; NEON-NEXT:    movi v7.4s, #32
+; NEON-NEXT:    movi v16.4s, #128
+; NEON-NEXT:    movi v17.4s, #1, lsl #8
+; NEON-NEXT:    movi v18.4s, #8, lsl #8
+; NEON-NEXT:    movi v19.4s, #16, lsl #8
+; NEON-NEXT:    movi v20.4s, #64
+; NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    movi v21.4s, #2, lsl #8
+; NEON-NEXT:    movi v22.4s, #32, lsl #8
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-NEXT:    xtn v2.4h, v2.4s
+; NEON-NEXT:    xtn v3.4h, v3.4s
+; NEON-NEXT:    xtn v4.4h, v4.4s
+; NEON-NEXT:    xtn v5.4h, v5.4s
+; NEON-NEXT:    movi v23.4s, #4, lsl #8
+; NEON-NEXT:    movi v24.4s, #64, lsl #8
+; NEON-NEXT:    xtn v6.4h, v6.4s
+; NEON-NEXT:    xtn v7.4h, v7.4s
+; NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-NEXT:    xtn v16.4h, v16.4s
+; NEON-NEXT:    xtn v17.4h, v17.4s
+; NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-NEXT:    xtn v18.4h, v18.4s
+; NEON-NEXT:    xtn v19.4h, v19.4s
+; NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-NEXT:    umull v2.4s, v0.4h, v2.4h
+; NEON-NEXT:    umull v3.4s, v0.4h, v3.4h
+; NEON-NEXT:    umull v4.4s, v0.4h, v4.4h
+; NEON-NEXT:    umull v5.4s, v0.4h, v5.4h
+; NEON-NEXT:    movi v25.4s, #128, lsl #8
+; NEON-NEXT:    xtn v20.4h, v20.4s
+; NEON-NEXT:    xtn v21.4h, v21.4s
+; NEON-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-NEXT:    xtn v22.4h, v22.4s
+; NEON-NEXT:    and v24.16b, v1.16b, v24.16b
+; NEON-NEXT:    umull v6.4s, v0.4h, v6.4h
+; NEON-NEXT:    umull v7.4s, v0.4h, v7.4h
+; NEON-NEXT:    umull v16.4s, v0.4h, v16.4h
+; NEON-NEXT:    umull v17.4s, v0.4h, v17.4h
+; NEON-NEXT:    umull v18.4s, v0.4h, v18.4h
+; NEON-NEXT:    umull v19.4s, v0.4h, v19.4h
+; NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-NEXT:    and v1.16b, v1.16b, v25.16b
+; NEON-NEXT:    xtn v4.4h, v23.4s
+; NEON-NEXT:    xtn v5.4h, v24.4s
+; NEON-NEXT:    umull v20.4s, v0.4h, v20.4h
+; NEON-NEXT:    umull v21.4s, v0.4h, v21.4h
+; NEON-NEXT:    umull v22.4s, v0.4h, v22.4h
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    eor v7.16b, v16.16b, v17.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v16.16b, v18.16b, v19.16b
+; NEON-NEXT:    xtn v1.4h, v1.4s
+; NEON-NEXT:    umull v3.4s, v0.4h, v4.4h
+; NEON-NEXT:    umull v4.4s, v0.4h, v5.4h
+; NEON-NEXT:    eor v5.16b, v6.16b, v20.16b
+; NEON-NEXT:    eor v6.16b, v7.16b, v21.16b
+; NEON-NEXT:    eor v7.16b, v16.16b, v22.16b
+; NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
+; NEON-NEXT:    eor v1.16b, v2.16b, v5.16b
+; NEON-NEXT:    eor v2.16b, v6.16b, v3.16b
+; NEON-NEXT:    eor v3.16b, v7.16b, v4.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v4i32_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    movi v2.4s, #2
+; NEON-AES-NEXT:    movi v3.4s, #1
+; NEON-AES-NEXT:    movi v4.4s, #4
+; NEON-AES-NEXT:    movi v5.4s, #8
+; NEON-AES-NEXT:    ushll v1.4s, v1.4h, #0
+; NEON-AES-NEXT:    movi v6.4s, #16
+; NEON-AES-NEXT:    movi v7.4s, #32
+; NEON-AES-NEXT:    movi v16.4s, #128
+; NEON-AES-NEXT:    movi v17.4s, #1, lsl #8
+; NEON-AES-NEXT:    movi v18.4s, #8, lsl #8
+; NEON-AES-NEXT:    movi v19.4s, #16, lsl #8
+; NEON-AES-NEXT:    movi v20.4s, #64
+; NEON-AES-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    movi v21.4s, #2, lsl #8
+; NEON-AES-NEXT:    movi v22.4s, #32, lsl #8
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    xtn v2.4h, v2.4s
+; NEON-AES-NEXT:    xtn v3.4h, v3.4s
+; NEON-AES-NEXT:    xtn v4.4h, v4.4s
+; NEON-AES-NEXT:    xtn v5.4h, v5.4s
+; NEON-AES-NEXT:    movi v23.4s, #4, lsl #8
+; NEON-AES-NEXT:    movi v24.4s, #64, lsl #8
+; NEON-AES-NEXT:    xtn v6.4h, v6.4s
+; NEON-AES-NEXT:    xtn v7.4h, v7.4s
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    xtn v16.4h, v16.4s
+; NEON-AES-NEXT:    xtn v17.4h, v17.4s
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    xtn v18.4h, v18.4s
+; NEON-AES-NEXT:    xtn v19.4h, v19.4s
+; NEON-AES-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    umull v2.4s, v0.4h, v2.4h
+; NEON-AES-NEXT:    umull v3.4s, v0.4h, v3.4h
+; NEON-AES-NEXT:    umull v4.4s, v0.4h, v4.4h
+; NEON-AES-NEXT:    umull v5.4s, v0.4h, v5.4h
+; NEON-AES-NEXT:    movi v25.4s, #128, lsl #8
+; NEON-AES-NEXT:    xtn v20.4h, v20.4s
+; NEON-AES-NEXT:    xtn v21.4h, v21.4s
+; NEON-AES-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-AES-NEXT:    xtn v22.4h, v22.4s
+; NEON-AES-NEXT:    and v24.16b, v1.16b, v24.16b
+; NEON-AES-NEXT:    umull v6.4s, v0.4h, v6.4h
+; NEON-AES-NEXT:    umull v7.4s, v0.4h, v7.4h
+; NEON-AES-NEXT:    umull v16.4s, v0.4h, v16.4h
+; NEON-AES-NEXT:    umull v17.4s, v0.4h, v17.4h
+; NEON-AES-NEXT:    umull v18.4s, v0.4h, v18.4h
+; NEON-AES-NEXT:    umull v19.4s, v0.4h, v19.4h
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v25.16b
+; NEON-AES-NEXT:    xtn v4.4h, v23.4s
+; NEON-AES-NEXT:    xtn v5.4h, v24.4s
+; NEON-AES-NEXT:    umull v20.4s, v0.4h, v20.4h
+; NEON-AES-NEXT:    umull v21.4s, v0.4h, v21.4h
+; NEON-AES-NEXT:    umull v22.4s, v0.4h, v22.4h
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    eor v7.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v16.16b, v18.16b, v19.16b
+; NEON-AES-NEXT:    xtn v1.4h, v1.4s
+; NEON-AES-NEXT:    umull v3.4s, v0.4h, v4.4h
+; NEON-AES-NEXT:    umull v4.4s, v0.4h, v5.4h
+; NEON-AES-NEXT:    eor v5.16b, v6.16b, v20.16b
+; NEON-AES-NEXT:    eor v6.16b, v7.16b, v21.16b
+; NEON-AES-NEXT:    eor v7.16b, v16.16b, v22.16b
+; NEON-AES-NEXT:    umull v0.4s, v0.4h, v1.4h
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v5.16b
+; NEON-AES-NEXT:    eor v2.16b, v6.16b, v3.16b
+; NEON-AES-NEXT:    eor v3.16b, v7.16b, v4.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    ret
+  %zextx = zext <4 x i16> %x to <4 x i32> ; widen each lane of %x from i16 to i32; the added high bits are zero
+  %zexty = zext <4 x i16> %y to <4 x i32> ; same widening for the second operand
+  %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %zextx, <4 x i32> %zexty) ; carry-less multiply of the widened vectors
+  ret <4 x i32> %a
+}
+
+define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
+; NEON-LABEL: clmul_v8i32_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; NEON-NEXT:    .cfi_def_cfa_offset 16
+; NEON-NEXT:    .cfi_offset b8, -8
+; NEON-NEXT:    .cfi_offset b9, -16
+; NEON-NEXT:    movi v4.4s, #2
+; NEON-NEXT:    movi v5.4s, #1
+; NEON-NEXT:    movi v6.4s, #4
+; NEON-NEXT:    ushll v2.4s, v1.4h, #0
+; NEON-NEXT:    movi v3.4s, #8
+; NEON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; NEON-NEXT:    movi v20.4s, #16
+; NEON-NEXT:    movi v21.4s, #32
+; NEON-NEXT:    and v17.16b, v2.16b, v4.16b
+; NEON-NEXT:    and v7.16b, v2.16b, v5.16b
+; NEON-NEXT:    and v16.16b, v2.16b, v6.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    and v18.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v3.16b, v2.16b, v3.16b
+; NEON-NEXT:    and v24.16b, v1.16b, v20.16b
+; NEON-NEXT:    xtn v17.4h, v17.4s
+; NEON-NEXT:    xtn v7.4h, v7.4s
+; NEON-NEXT:    xtn v19.4h, v16.4s
+; NEON-NEXT:    uzp1 v4.8h, v0.8h, v4.8h
+; NEON-NEXT:    uzp1 v22.8h, v0.8h, v5.8h
+; NEON-NEXT:    uzp1 v23.8h, v0.8h, v6.8h
+; NEON-NEXT:    uzp1 v18.8h, v0.8h, v18.8h
+; NEON-NEXT:    and v25.16b, v1.16b, v21.16b
+; NEON-NEXT:    movi v6.4s, #128
+; NEON-NEXT:    uzp1 v24.8h, v0.8h, v24.8h
+; NEON-NEXT:    and v28.16b, v2.16b, v20.16b
+; NEON-NEXT:    and v21.16b, v2.16b, v21.16b
+; NEON-NEXT:    umull v5.4s, v0.4h, v17.4h
+; NEON-NEXT:    umull v16.4s, v0.4h, v7.4h
+; NEON-NEXT:    umull v17.4s, v0.4h, v19.4h
+; NEON-NEXT:    xtn v19.4h, v3.4s
+; NEON-NEXT:    movi v3.4s, #64
+; NEON-NEXT:    movi v7.4s, #1, lsl #8
+; NEON-NEXT:    umull2 v26.4s, v0.8h, v4.8h
+; NEON-NEXT:    umull2 v22.4s, v0.8h, v22.8h
+; NEON-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; NEON-NEXT:    umull2 v27.4s, v0.8h, v18.8h
+; NEON-NEXT:    uzp1 v25.8h, v0.8h, v25.8h
+; NEON-NEXT:    movi v4.4s, #2, lsl #8
+; NEON-NEXT:    and v30.16b, v1.16b, v6.16b
+; NEON-NEXT:    movi v18.4s, #8, lsl #8
+; NEON-NEXT:    movi v20.4s, #16, lsl #8
+; NEON-NEXT:    and v29.16b, v1.16b, v3.16b
+; NEON-NEXT:    and v31.16b, v1.16b, v7.16b
+; NEON-NEXT:    umull2 v24.4s, v0.8h, v24.8h
+; NEON-NEXT:    eor v22.16b, v22.16b, v26.16b
+; NEON-NEXT:    xtn v28.4h, v28.4s
+; NEON-NEXT:    umull v19.4s, v0.4h, v19.4h
+; NEON-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-NEXT:    umull2 v25.4s, v0.8h, v25.8h
+; NEON-NEXT:    uzp1 v27.8h, v0.8h, v30.8h
+; NEON-NEXT:    uzp1 v26.8h, v0.8h, v29.8h
+; NEON-NEXT:    uzp1 v29.8h, v0.8h, v31.8h
+; NEON-NEXT:    and v30.16b, v1.16b, v4.16b
+; NEON-NEXT:    xtn v31.4h, v21.4s
+; NEON-NEXT:    movi v21.4s, #32, lsl #8
+; NEON-NEXT:    and v8.16b, v1.16b, v20.16b
+; NEON-NEXT:    eor v22.16b, v22.16b, v23.16b
+; NEON-NEXT:    and v23.16b, v1.16b, v18.16b
+; NEON-NEXT:    umull v28.4s, v0.4h, v28.4h
+; NEON-NEXT:    eor v24.16b, v24.16b, v25.16b
+; NEON-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; NEON-NEXT:    eor v16.16b, v16.16b, v5.16b
+; NEON-NEXT:    umull2 v25.4s, v0.8h, v26.8h
+; NEON-NEXT:    uzp1 v26.8h, v0.8h, v30.8h
+; NEON-NEXT:    umull2 v29.4s, v0.8h, v29.8h
+; NEON-NEXT:    movi v30.2d, #0000000000000000
+; NEON-NEXT:    uzp1 v23.8h, v0.8h, v23.8h
+; NEON-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
+; NEON-NEXT:    and v9.16b, v1.16b, v21.16b
+; NEON-NEXT:    umull v31.4s, v0.4h, v31.4h
+; NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
+; NEON-NEXT:    and v6.16b, v2.16b, v6.16b
+; NEON-NEXT:    and v7.16b, v2.16b, v7.16b
+; NEON-NEXT:    and v18.16b, v2.16b, v18.16b
+; NEON-NEXT:    umull2 v26.4s, v0.8h, v26.8h
+; NEON-NEXT:    eor v27.16b, v27.16b, v29.16b
+; NEON-NEXT:    eor v24.16b, v24.16b, v25.16b
+; NEON-NEXT:    uzp1 v29.8h, v0.8h, v9.8h
+; NEON-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
+; NEON-NEXT:    movi v9.4s, #64, lsl #8
+; NEON-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; NEON-NEXT:    umull2 v8.4s, v0.8h, v8.8h
+; NEON-NEXT:    movi v25.4s, #4, lsl #8
+; NEON-NEXT:    eor v22.16b, v22.16b, v24.16b
+; NEON-NEXT:    eor v19.16b, v28.16b, v31.16b
+; NEON-NEXT:    movi v28.4s, #128, lsl #8
+; NEON-NEXT:    eor v24.16b, v27.16b, v26.16b
+; NEON-NEXT:    and v20.16b, v2.16b, v20.16b
+; NEON-NEXT:    xtn v6.4h, v6.4s
+; NEON-NEXT:    umull2 v27.4s, v0.8h, v29.8h
+; NEON-NEXT:    umull2 v5.4s, v0.8h, v30.8h
+; NEON-NEXT:    and v29.16b, v1.16b, v9.16b
+; NEON-NEXT:    eor v23.16b, v23.16b, v8.16b
+; NEON-NEXT:    and v26.16b, v1.16b, v25.16b
+; NEON-NEXT:    xtn v7.4h, v7.4s
+; NEON-NEXT:    and v1.16b, v1.16b, v28.16b
+; NEON-NEXT:    and v4.16b, v2.16b, v4.16b
+; NEON-NEXT:    xtn v18.4h, v18.4s
+; NEON-NEXT:    xtn v20.4h, v20.4s
+; NEON-NEXT:    and v3.16b, v2.16b, v3.16b
+; NEON-NEXT:    and v21.16b, v2.16b, v21.16b
+; NEON-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
+; NEON-NEXT:    eor v29.16b, v5.16b, v5.16b
+; NEON-NEXT:    uzp1 v26.8h, v0.8h, v26.8h
+; NEON-NEXT:    uzp1 v1.8h, v0.8h, v1.8h
+; NEON-NEXT:    xtn v4.4h, v4.4s
+; NEON-NEXT:    xtn v3.4h, v3.4s
+; NEON-NEXT:    umull v6.4s, v0.4h, v6.4h
+; NEON-NEXT:    umull v7.4s, v0.4h, v7.4h
+; NEON-NEXT:    eor v29.16b, v29.16b, v5.16b
+; NEON-NEXT:    and v25.16b, v2.16b, v25.16b
+; NEON-NEXT:    umull v18.4s, v0.4h, v18.4h
+; NEON-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; NEON-NEXT:    umull v20.4s, v0.4h, v20.4h
+; NEON-NEXT:    xtn v21.4h, v21.4s
+; NEON-NEXT:    umull2 v26.4s, v0.8h, v26.8h
+; NEON-NEXT:    and v30.16b, v2.16b, v9.16b
+; NEON-NEXT:    umull2 v1.4s, v0.8h, v1.8h
+; NEON-NEXT:    eor v29.16b, v29.16b, v5.16b
+; NEON-NEXT:    xtn v25.4h, v25.4s
+; NEON-NEXT:    umull v4.4s, v0.4h, v4.4h
+; NEON-NEXT:    and v2.16b, v2.16b, v28.16b
+; NEON-NEXT:    umull v3.4s, v0.4h, v3.4h
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-NEXT:    xtn v27.4h, v30.4s
+; NEON-NEXT:    eor v7.16b, v18.16b, v20.16b
+; NEON-NEXT:    eor v24.16b, v24.16b, v26.16b
+; NEON-NEXT:    eor v26.16b, v29.16b, v5.16b
+; NEON-NEXT:    umull v18.4s, v0.4h, v21.4h
+; NEON-NEXT:    xtn v2.4h, v2.4s
+; NEON-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-NEXT:    umull v17.4s, v0.4h, v25.4h
+; NEON-NEXT:    eor v1.16b, v23.16b, v1.16b
+; NEON-NEXT:    eor v4.16b, v6.16b, v4.16b
+; NEON-NEXT:    eor v3.16b, v19.16b, v3.16b
+; NEON-NEXT:    eor v20.16b, v22.16b, v24.16b
+; NEON-NEXT:    eor v21.16b, v26.16b, v5.16b
+; NEON-NEXT:    umull v6.4s, v0.4h, v27.4h
+; NEON-NEXT:    eor v7.16b, v7.16b, v18.16b
+; NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
+; NEON-NEXT:    eor v3.16b, v16.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v17.16b
+; NEON-NEXT:    eor v1.16b, v20.16b, v1.16b
+; NEON-NEXT:    eor v18.16b, v21.16b, v5.16b
+; NEON-NEXT:    eor v2.16b, v7.16b, v6.16b
+; NEON-NEXT:    eor v3.16b, v3.16b, v4.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v18.16b
+; NEON-NEXT:    eor v6.16b, v18.16b, v5.16b
+; NEON-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v6.16b
+; NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v5.16b
+; NEON-NEXT:    ldp d9, d8, [sp], #16 // 16-byte Folded Reload
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v8i32_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; NEON-AES-NEXT:    .cfi_def_cfa_offset 16
+; NEON-AES-NEXT:    .cfi_offset b8, -8
+; NEON-AES-NEXT:    .cfi_offset b9, -16
+; NEON-AES-NEXT:    movi v4.4s, #2
+; NEON-AES-NEXT:    movi v5.4s, #1
+; NEON-AES-NEXT:    movi v6.4s, #4
+; NEON-AES-NEXT:    ushll v2.4s, v1.4h, #0
+; NEON-AES-NEXT:    movi v3.4s, #8
+; NEON-AES-NEXT:    ushll2 v1.4s, v1.8h, #0
+; NEON-AES-NEXT:    movi v20.4s, #16
+; NEON-AES-NEXT:    movi v21.4s, #32
+; NEON-AES-NEXT:    and v17.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    and v7.16b, v2.16b, v5.16b
+; NEON-AES-NEXT:    and v16.16b, v2.16b, v6.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v3.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    and v24.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    xtn v17.4h, v17.4s
+; NEON-AES-NEXT:    xtn v7.4h, v7.4s
+; NEON-AES-NEXT:    xtn v19.4h, v16.4s
+; NEON-AES-NEXT:    uzp1 v4.8h, v0.8h, v4.8h
+; NEON-AES-NEXT:    uzp1 v22.8h, v0.8h, v5.8h
+; NEON-AES-NEXT:    uzp1 v23.8h, v0.8h, v6.8h
+; NEON-AES-NEXT:    uzp1 v18.8h, v0.8h, v18.8h
+; NEON-AES-NEXT:    and v25.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    movi v6.4s, #128
+; NEON-AES-NEXT:    uzp1 v24.8h, v0.8h, v24.8h
+; NEON-AES-NEXT:    and v28.16b, v2.16b, v20.16b
+; NEON-AES-NEXT:    and v21.16b, v2.16b, v21.16b
+; NEON-AES-NEXT:    umull v5.4s, v0.4h, v17.4h
+; NEON-AES-NEXT:    umull v16.4s, v0.4h, v7.4h
+; NEON-AES-NEXT:    umull v17.4s, v0.4h, v19.4h
+; NEON-AES-NEXT:    xtn v19.4h, v3.4s
+; NEON-AES-NEXT:    movi v3.4s, #64
+; NEON-AES-NEXT:    movi v7.4s, #1, lsl #8
+; NEON-AES-NEXT:    umull2 v26.4s, v0.8h, v4.8h
+; NEON-AES-NEXT:    umull2 v22.4s, v0.8h, v22.8h
+; NEON-AES-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; NEON-AES-NEXT:    umull2 v27.4s, v0.8h, v18.8h
+; NEON-AES-NEXT:    uzp1 v25.8h, v0.8h, v25.8h
+; NEON-AES-NEXT:    movi v4.4s, #2, lsl #8
+; NEON-AES-NEXT:    and v30.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    movi v18.4s, #8, lsl #8
+; NEON-AES-NEXT:    movi v20.4s, #16, lsl #8
+; NEON-AES-NEXT:    and v29.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    and v31.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    umull2 v24.4s, v0.8h, v24.8h
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v26.16b
+; NEON-AES-NEXT:    xtn v28.4h, v28.4s
+; NEON-AES-NEXT:    umull v19.4s, v0.4h, v19.4h
+; NEON-AES-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-AES-NEXT:    umull2 v25.4s, v0.8h, v25.8h
+; NEON-AES-NEXT:    uzp1 v27.8h, v0.8h, v30.8h
+; NEON-AES-NEXT:    uzp1 v26.8h, v0.8h, v29.8h
+; NEON-AES-NEXT:    uzp1 v29.8h, v0.8h, v31.8h
+; NEON-AES-NEXT:    and v30.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    xtn v31.4h, v21.4s
+; NEON-AES-NEXT:    movi v21.4s, #32, lsl #8
+; NEON-AES-NEXT:    and v8.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v23.16b
+; NEON-AES-NEXT:    and v23.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    umull v28.4s, v0.4h, v28.4h
+; NEON-AES-NEXT:    eor v24.16b, v24.16b, v25.16b
+; NEON-AES-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v5.16b
+; NEON-AES-NEXT:    umull2 v25.4s, v0.8h, v26.8h
+; NEON-AES-NEXT:    uzp1 v26.8h, v0.8h, v30.8h
+; NEON-AES-NEXT:    umull2 v29.4s, v0.8h, v29.8h
+; NEON-AES-NEXT:    movi v30.2d, #0000000000000000
+; NEON-AES-NEXT:    uzp1 v23.8h, v0.8h, v23.8h
+; NEON-AES-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
+; NEON-AES-NEXT:    and v9.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    umull v31.4s, v0.4h, v31.4h
+; NEON-AES-NEXT:    eor v17.16b, v17.16b, v19.16b
+; NEON-AES-NEXT:    and v6.16b, v2.16b, v6.16b
+; NEON-AES-NEXT:    and v7.16b, v2.16b, v7.16b
+; NEON-AES-NEXT:    and v18.16b, v2.16b, v18.16b
+; NEON-AES-NEXT:    umull2 v26.4s, v0.8h, v26.8h
+; NEON-AES-NEXT:    eor v27.16b, v27.16b, v29.16b
+; NEON-AES-NEXT:    eor v24.16b, v24.16b, v25.16b
+; NEON-AES-NEXT:    uzp1 v29.8h, v0.8h, v9.8h
+; NEON-AES-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
+; NEON-AES-NEXT:    movi v9.4s, #64, lsl #8
+; NEON-AES-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; NEON-AES-NEXT:    umull2 v8.4s, v0.8h, v8.8h
+; NEON-AES-NEXT:    movi v25.4s, #4, lsl #8
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v24.16b
+; NEON-AES-NEXT:    eor v19.16b, v28.16b, v31.16b
+; NEON-AES-NEXT:    movi v28.4s, #128, lsl #8
+; NEON-AES-NEXT:    eor v24.16b, v27.16b, v26.16b
+; NEON-AES-NEXT:    and v20.16b, v2.16b, v20.16b
+; NEON-AES-NEXT:    xtn v6.4h, v6.4s
+; NEON-AES-NEXT:    umull2 v27.4s, v0.8h, v29.8h
+; NEON-AES-NEXT:    umull2 v5.4s, v0.8h, v30.8h
+; NEON-AES-NEXT:    and v29.16b, v1.16b, v9.16b
+; NEON-AES-NEXT:    eor v23.16b, v23.16b, v8.16b
+; NEON-AES-NEXT:    and v26.16b, v1.16b, v25.16b
+; NEON-AES-NEXT:    xtn v7.4h, v7.4s
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v28.16b
+; NEON-AES-NEXT:    and v4.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    xtn v18.4h, v18.4s
+; NEON-AES-NEXT:    xtn v20.4h, v20.4s
+; NEON-AES-NEXT:    and v3.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    and v21.16b, v2.16b, v21.16b
+; NEON-AES-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-AES-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
+; NEON-AES-NEXT:    eor v29.16b, v5.16b, v5.16b
+; NEON-AES-NEXT:    uzp1 v26.8h, v0.8h, v26.8h
+; NEON-AES-NEXT:    uzp1 v1.8h, v0.8h, v1.8h
+; NEON-AES-NEXT:    xtn v4.4h, v4.4s
+; NEON-AES-NEXT:    xtn v3.4h, v3.4s
+; NEON-AES-NEXT:    umull v6.4s, v0.4h, v6.4h
+; NEON-AES-NEXT:    umull v7.4s, v0.4h, v7.4h
+; NEON-AES-NEXT:    eor v29.16b, v29.16b, v5.16b
+; NEON-AES-NEXT:    and v25.16b, v2.16b, v25.16b
+; NEON-AES-NEXT:    umull v18.4s, v0.4h, v18.4h
+; NEON-AES-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; NEON-AES-NEXT:    umull v20.4s, v0.4h, v20.4h
+; NEON-AES-NEXT:    xtn v21.4h, v21.4s
+; NEON-AES-NEXT:    umull2 v26.4s, v0.8h, v26.8h
+; NEON-AES-NEXT:    and v30.16b, v2.16b, v9.16b
+; NEON-AES-NEXT:    umull2 v1.4s, v0.8h, v1.8h
+; NEON-AES-NEXT:    eor v29.16b, v29.16b, v5.16b
+; NEON-AES-NEXT:    xtn v25.4h, v25.4s
+; NEON-AES-NEXT:    umull v4.4s, v0.4h, v4.4h
+; NEON-AES-NEXT:    and v2.16b, v2.16b, v28.16b
+; NEON-AES-NEXT:    umull v3.4s, v0.4h, v3.4h
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    eor v23.16b, v23.16b, v27.16b
+; NEON-AES-NEXT:    xtn v27.4h, v30.4s
+; NEON-AES-NEXT:    eor v7.16b, v18.16b, v20.16b
+; NEON-AES-NEXT:    eor v24.16b, v24.16b, v26.16b
+; NEON-AES-NEXT:    eor v26.16b, v29.16b, v5.16b
+; NEON-AES-NEXT:    umull v18.4s, v0.4h, v21.4h
+; NEON-AES-NEXT:    xtn v2.4h, v2.4s
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    umull v17.4s, v0.4h, v25.4h
+; NEON-AES-NEXT:    eor v1.16b, v23.16b, v1.16b
+; NEON-AES-NEXT:    eor v4.16b, v6.16b, v4.16b
+; NEON-AES-NEXT:    eor v3.16b, v19.16b, v3.16b
+; NEON-AES-NEXT:    eor v20.16b, v22.16b, v24.16b
+; NEON-AES-NEXT:    eor v21.16b, v26.16b, v5.16b
+; NEON-AES-NEXT:    umull v6.4s, v0.4h, v27.4h
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v18.16b
+; NEON-AES-NEXT:    umull v0.4s, v0.4h, v2.4h
+; NEON-AES-NEXT:    eor v3.16b, v16.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v17.16b
+; NEON-AES-NEXT:    eor v1.16b, v20.16b, v1.16b
+; NEON-AES-NEXT:    eor v18.16b, v21.16b, v5.16b
+; NEON-AES-NEXT:    eor v2.16b, v7.16b, v6.16b
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v4.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    eor v6.16b, v18.16b, v5.16b
+; NEON-AES-NEXT:    eor v0.16b, v2.16b, v0.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    ldp d9, d8, [sp], #16 // 16-byte Folded Reload
+; NEON-AES-NEXT:    ret
+  %zextx = zext <8 x i16> %x to <8 x i32> ; Test input: widen %x lane-wise from i16 to i32, so the upper 16 bits of every lane are zero.
+  %zexty = zext <8 x i16> %y to <8 x i32> ; Widen %y the same way.
+  %a = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %zextx, <8 x i32> %zexty) ; Carry-less multiply (llvm.clmul) of the two zero-extended vectors. NOTE(review): the checked code still multiplies by a register derived from `movi v30.2d, #0` and XORs the result (v5) in repeatedly - presumably this records current codegen as a baseline for a later zext-aware lowering; confirm.
+  ret <8 x i32> %a
+}
+
+define <2 x i64> @clmul_v2i64_neon_zext(<2 x i32> %x, <2 x i32> %y) {
+; NEON-LABEL: clmul_v2i64_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    mov w8, #2 // =0x2
+; NEON-NEXT:    mov w9, #4 // =0x4
+; NEON-NEXT:    ushll v1.2d, v1.2s, #0
+; NEON-NEXT:    dup v2.2d, x8
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    dup v4.2d, x9
+; NEON-NEXT:    dup v3.2d, x8
+; NEON-NEXT:    mov w8, #8 // =0x8
+; NEON-NEXT:    mov w9, #32 // =0x20
+; NEON-NEXT:    dup v5.2d, x8
+; NEON-NEXT:    mov w8, #16 // =0x10
+; NEON-NEXT:    dup v7.2d, x9
+; NEON-NEXT:    dup v6.2d, x8
+; NEON-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    mov w8, #64 // =0x40
+; NEON-NEXT:    movi v24.2d, #0000000000000000
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    dup v16.2d, x8
+; NEON-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-NEXT:    xtn v2.2s, v2.2d
+; NEON-NEXT:    mov w8, #128 // =0x80
+; NEON-NEXT:    xtn v3.2s, v3.2d
+; NEON-NEXT:    xtn v4.2s, v4.2d
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    xtn v5.2s, v5.2d
+; NEON-NEXT:    mov w8, #256 // =0x100
+; NEON-NEXT:    xtn v7.2s, v7.2d
+; NEON-NEXT:    xtn v6.2s, v6.2d
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    dup v18.2d, x8
+; NEON-NEXT:    mov w8, #512 // =0x200
+; NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-NEXT:    fmov v26.2d, #2.00000000
+; NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-NEXT:    xtn v16.2s, v16.2d
+; NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-NEXT:    fneg v24.2d, v24.2d
+; NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-NEXT:    and v26.16b, v1.16b, v26.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-NEXT:    and v4.16b, v1.16b, v18.16b
+; NEON-NEXT:    dup v5.2d, x8
+; NEON-NEXT:    mov w8, #2048 // =0x800
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
+; NEON-NEXT:    xtn v16.2s, v17.2d
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    mov w8, #4096 // =0x1000
+; NEON-NEXT:    dup v18.2d, x8
+; NEON-NEXT:    mov w8, #1024 // =0x400
+; NEON-NEXT:    xtn v4.2s, v4.2d
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    dup v3.2d, x8
+; NEON-NEXT:    mov w8, #8192 // =0x2000
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-NEXT:    and v7.16b, v1.16b, v18.16b
+; NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-NEXT:    dup v18.2d, x8
+; NEON-NEXT:    mov w8, #65536 // =0x10000
+; NEON-NEXT:    xtn v5.2s, v5.2d
+; NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-NEXT:    xtn v17.2s, v17.2d
+; NEON-NEXT:    eor v2.16b, v2.16b, v6.16b
+; NEON-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-NEXT:    xtn v6.2s, v7.2d
+; NEON-NEXT:    dup v7.2d, x8
+; NEON-NEXT:    mov w8, #131072 // =0x20000
+; NEON-NEXT:    dup v19.2d, x8
+; NEON-NEXT:    mov w8, #16384 // =0x4000
+; NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-NEXT:    eor v4.16b, v16.16b, v4.16b
+; NEON-NEXT:    xtn v3.2s, v3.2d
+; NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    mov w8, #262144 // =0x40000
+; NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-NEXT:    xtn v18.2s, v18.2d
+; NEON-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-NEXT:    dup v5.2d, x8
+; NEON-NEXT:    mov w8, #32768 // =0x8000
+; NEON-NEXT:    xtn v7.2s, v7.2d
+; NEON-NEXT:    xtn v19.2s, v19.2d
+; NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
+; NEON-NEXT:    dup v16.2d, x8
+; NEON-NEXT:    mov w8, #4194304 // =0x400000
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-NEXT:    dup v20.2d, x8
+; NEON-NEXT:    mov w8, #8388608 // =0x800000
+; NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-NEXT:    dup v21.2d, x8
+; NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-NEXT:    xtn v17.2s, v17.2d
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    xtn v5.2s, v5.2d
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-NEXT:    dup v4.2d, x8
+; NEON-NEXT:    mov w8, #524288 // =0x80000
+; NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-NEXT:    dup v22.2d, x8
+; NEON-NEXT:    mov w8, #1048576 // =0x100000
+; NEON-NEXT:    eor v6.16b, v6.16b, v18.16b
+; NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-NEXT:    xtn v16.2s, v16.2d
+; NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
+; NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-NEXT:    xtn v19.2s, v20.2d
+; NEON-NEXT:    xtn v20.2s, v21.2d
+; NEON-NEXT:    and v18.16b, v1.16b, v22.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    movi v22.4s, #128, lsl #24
+; NEON-NEXT:    xtn v21.2s, v4.2d
+; NEON-NEXT:    eor v3.16b, v6.16b, v17.16b
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    mov w8, #2097152 // =0x200000
+; NEON-NEXT:    umull v4.2d, v0.2s, v16.2s
+; NEON-NEXT:    eor v5.16b, v7.16b, v5.16b
+; NEON-NEXT:    umull v7.2d, v0.2s, v19.2s
+; NEON-NEXT:    umull v16.2d, v0.2s, v20.2s
+; NEON-NEXT:    xtn v6.2s, v18.2d
+; NEON-NEXT:    dup v18.2d, x8
+; NEON-NEXT:    mov w8, #33554432 // =0x2000000
+; NEON-NEXT:    fneg v22.2d, v22.2d
+; NEON-NEXT:    dup v20.2d, x8
+; NEON-NEXT:    mov w8, #67108864 // =0x4000000
+; NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
+; NEON-NEXT:    dup v21.2d, x8
+; NEON-NEXT:    mov w8, #536870912 // =0x20000000
+; NEON-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
+; NEON-NEXT:    dup v16.2d, x8
+; NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; NEON-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-NEXT:    dup v23.2d, x8
+; NEON-NEXT:    mov w8, #134217728 // =0x8000000
+; NEON-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-NEXT:    xtn v17.2s, v17.2d
+; NEON-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-NEXT:    eor v7.16b, v7.16b, v19.16b
+; NEON-NEXT:    dup v19.2d, x8
+; NEON-NEXT:    mov w8, #268435456 // =0x10000000
+; NEON-NEXT:    xtn v20.2s, v20.2d
+; NEON-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-NEXT:    xtn v21.2s, v21.2d
+; NEON-NEXT:    dup v25.2d, x8
+; NEON-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-NEXT:    xtn v18.2s, v18.2d
+; NEON-NEXT:    xtn v16.2s, v16.2d
+; NEON-NEXT:    xtn v23.2s, v23.2d
+; NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; NEON-NEXT:    and v25.16b, v1.16b, v25.16b
+; NEON-NEXT:    xtn v22.2s, v22.2d
+; NEON-NEXT:    xtn v19.2s, v19.2d
+; NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-NEXT:    eor v3.16b, v3.16b, v4.16b
+; NEON-NEXT:    eor v4.16b, v5.16b, v6.16b
+; NEON-NEXT:    and v1.16b, v1.16b, v24.16b
+; NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-NEXT:    umull v23.2d, v0.2s, v23.2s
+; NEON-NEXT:    xtn v6.2s, v25.2d
+; NEON-NEXT:    eor v5.16b, v7.16b, v20.16b
+; NEON-NEXT:    xtn v7.2s, v26.2d
+; NEON-NEXT:    umull v20.2d, v0.2s, v22.2s
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    eor v4.16b, v4.16b, v17.16b
+; NEON-NEXT:    xtn v1.2s, v1.2d
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v5.16b, v5.16b, v21.16b
+; NEON-NEXT:    eor v16.16b, v16.16b, v23.16b
+; NEON-NEXT:    umull v3.2d, v0.2s, v6.2s
+; NEON-NEXT:    umull v6.2d, v0.2s, v7.2s
+; NEON-NEXT:    eor v4.16b, v4.16b, v18.16b
+; NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
+; NEON-NEXT:    eor v5.16b, v5.16b, v19.16b
+; NEON-NEXT:    eor v7.16b, v16.16b, v20.16b
+; NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-NEXT:    eor v2.16b, v5.16b, v3.16b
+; NEON-NEXT:    eor v3.16b, v7.16b, v6.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v2i64_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    mov w8, #2 // =0x2
+; NEON-AES-NEXT:    mov w9, #4 // =0x4
+; NEON-AES-NEXT:    ushll v1.2d, v1.2s, #0
+; NEON-AES-NEXT:    dup v2.2d, x8
+; NEON-AES-NEXT:    mov w8, #1 // =0x1
+; NEON-AES-NEXT:    dup v4.2d, x9
+; NEON-AES-NEXT:    dup v3.2d, x8
+; NEON-AES-NEXT:    mov w8, #8 // =0x8
+; NEON-AES-NEXT:    mov w9, #32 // =0x20
+; NEON-AES-NEXT:    dup v5.2d, x8
+; NEON-AES-NEXT:    mov w8, #16 // =0x10
+; NEON-AES-NEXT:    dup v7.2d, x9
+; NEON-AES-NEXT:    dup v6.2d, x8
+; NEON-AES-NEXT:    and v2.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    mov w8, #64 // =0x40
+; NEON-AES-NEXT:    movi v24.2d, #0000000000000000
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    dup v16.2d, x8
+; NEON-AES-NEXT:    and v6.16b, v1.16b, v6.16b
+; NEON-AES-NEXT:    xtn v2.2s, v2.2d
+; NEON-AES-NEXT:    mov w8, #128 // =0x80
+; NEON-AES-NEXT:    xtn v3.2s, v3.2d
+; NEON-AES-NEXT:    xtn v4.2s, v4.2d
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    xtn v5.2s, v5.2d
+; NEON-AES-NEXT:    mov w8, #256 // =0x100
+; NEON-AES-NEXT:    xtn v7.2s, v7.2d
+; NEON-AES-NEXT:    xtn v6.2s, v6.2d
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    dup v18.2d, x8
+; NEON-AES-NEXT:    mov w8, #512 // =0x200
+; NEON-AES-NEXT:    umull v2.2d, v0.2s, v2.2s
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-AES-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-AES-NEXT:    fmov v26.2d, #2.00000000
+; NEON-AES-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-AES-NEXT:    xtn v16.2s, v16.2d
+; NEON-AES-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-AES-NEXT:    fneg v24.2d, v24.2d
+; NEON-AES-NEXT:    eor v2.16b, v3.16b, v2.16b
+; NEON-AES-NEXT:    and v26.16b, v1.16b, v26.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    dup v5.2d, x8
+; NEON-AES-NEXT:    mov w8, #2048 // =0x800
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    xtn v16.2s, v17.2d
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    mov w8, #4096 // =0x1000
+; NEON-AES-NEXT:    dup v18.2d, x8
+; NEON-AES-NEXT:    mov w8, #1024 // =0x400
+; NEON-AES-NEXT:    xtn v4.2s, v4.2d
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    dup v3.2d, x8
+; NEON-AES-NEXT:    mov w8, #8192 // =0x2000
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v7.16b
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    dup v18.2d, x8
+; NEON-AES-NEXT:    mov w8, #65536 // =0x10000
+; NEON-AES-NEXT:    xtn v5.2s, v5.2d
+; NEON-AES-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-AES-NEXT:    xtn v17.2s, v17.2d
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v6.16b
+; NEON-AES-NEXT:    and v3.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    xtn v6.2s, v7.2d
+; NEON-AES-NEXT:    dup v7.2d, x8
+; NEON-AES-NEXT:    mov w8, #131072 // =0x20000
+; NEON-AES-NEXT:    dup v19.2d, x8
+; NEON-AES-NEXT:    mov w8, #16384 // =0x4000
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-AES-NEXT:    eor v4.16b, v16.16b, v4.16b
+; NEON-AES-NEXT:    xtn v3.2s, v3.2d
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v17.2s
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    mov w8, #262144 // =0x40000
+; NEON-AES-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-AES-NEXT:    and v7.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    xtn v18.2s, v18.2d
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v5.16b
+; NEON-AES-NEXT:    dup v5.2d, x8
+; NEON-AES-NEXT:    mov w8, #32768 // =0x8000
+; NEON-AES-NEXT:    xtn v7.2s, v7.2d
+; NEON-AES-NEXT:    xtn v19.2s, v19.2d
+; NEON-AES-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-AES-NEXT:    eor v6.16b, v16.16b, v6.16b
+; NEON-AES-NEXT:    dup v16.2d, x8
+; NEON-AES-NEXT:    mov w8, #4194304 // =0x400000
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    and v5.16b, v1.16b, v5.16b
+; NEON-AES-NEXT:    dup v20.2d, x8
+; NEON-AES-NEXT:    mov w8, #8388608 // =0x800000
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-AES-NEXT:    dup v21.2d, x8
+; NEON-AES-NEXT:    mov w8, #16777216 // =0x1000000
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-AES-NEXT:    xtn v17.2s, v17.2d
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    xtn v5.2s, v5.2d
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    dup v4.2d, x8
+; NEON-AES-NEXT:    mov w8, #524288 // =0x80000
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    dup v22.2d, x8
+; NEON-AES-NEXT:    mov w8, #1048576 // =0x100000
+; NEON-AES-NEXT:    eor v6.16b, v6.16b, v18.16b
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-AES-NEXT:    xtn v16.2s, v16.2d
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v19.16b
+; NEON-AES-NEXT:    umull v5.2d, v0.2s, v5.2s
+; NEON-AES-NEXT:    and v4.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    xtn v19.2s, v20.2d
+; NEON-AES-NEXT:    xtn v20.2s, v21.2d
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    movi v22.4s, #128, lsl #24
+; NEON-AES-NEXT:    xtn v21.2s, v4.2d
+; NEON-AES-NEXT:    eor v3.16b, v6.16b, v17.16b
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    mov w8, #2097152 // =0x200000
+; NEON-AES-NEXT:    umull v4.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    eor v5.16b, v7.16b, v5.16b
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v20.2s
+; NEON-AES-NEXT:    xtn v6.2s, v18.2d
+; NEON-AES-NEXT:    dup v18.2d, x8
+; NEON-AES-NEXT:    mov w8, #33554432 // =0x2000000
+; NEON-AES-NEXT:    fneg v22.2d, v22.2d
+; NEON-AES-NEXT:    dup v20.2d, x8
+; NEON-AES-NEXT:    mov w8, #67108864 // =0x4000000
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    dup v21.2d, x8
+; NEON-AES-NEXT:    mov w8, #536870912 // =0x20000000
+; NEON-AES-NEXT:    and v17.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v16.16b
+; NEON-AES-NEXT:    dup v16.2d, x8
+; NEON-AES-NEXT:    mov w8, #1073741824 // =0x40000000
+; NEON-AES-NEXT:    and v20.16b, v1.16b, v20.16b
+; NEON-AES-NEXT:    dup v23.2d, x8
+; NEON-AES-NEXT:    mov w8, #134217728 // =0x8000000
+; NEON-AES-NEXT:    and v21.16b, v1.16b, v21.16b
+; NEON-AES-NEXT:    xtn v17.2s, v17.2d
+; NEON-AES-NEXT:    and v18.16b, v1.16b, v18.16b
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v19.16b
+; NEON-AES-NEXT:    dup v19.2d, x8
+; NEON-AES-NEXT:    mov w8, #268435456 // =0x10000000
+; NEON-AES-NEXT:    xtn v20.2s, v20.2d
+; NEON-AES-NEXT:    and v16.16b, v1.16b, v16.16b
+; NEON-AES-NEXT:    and v23.16b, v1.16b, v23.16b
+; NEON-AES-NEXT:    xtn v21.2s, v21.2d
+; NEON-AES-NEXT:    dup v25.2d, x8
+; NEON-AES-NEXT:    and v22.16b, v1.16b, v22.16b
+; NEON-AES-NEXT:    and v19.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-AES-NEXT:    xtn v18.2s, v18.2d
+; NEON-AES-NEXT:    xtn v16.2s, v16.2d
+; NEON-AES-NEXT:    xtn v23.2s, v23.2d
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v20.2s
+; NEON-AES-NEXT:    and v25.16b, v1.16b, v25.16b
+; NEON-AES-NEXT:    xtn v22.2s, v22.2d
+; NEON-AES-NEXT:    xtn v19.2s, v19.2d
+; NEON-AES-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v4.16b
+; NEON-AES-NEXT:    eor v4.16b, v5.16b, v6.16b
+; NEON-AES-NEXT:    and v1.16b, v1.16b, v24.16b
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    umull v23.2d, v0.2s, v23.2s
+; NEON-AES-NEXT:    xtn v6.2s, v25.2d
+; NEON-AES-NEXT:    eor v5.16b, v7.16b, v20.16b
+; NEON-AES-NEXT:    xtn v7.2s, v26.2d
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v22.2s
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v17.16b
+; NEON-AES-NEXT:    xtn v1.2s, v1.2d
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v5.16b, v5.16b, v21.16b
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v23.16b
+; NEON-AES-NEXT:    umull v3.2d, v0.2s, v6.2s
+; NEON-AES-NEXT:    umull v6.2d, v0.2s, v7.2s
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v18.16b
+; NEON-AES-NEXT:    umull v0.2d, v0.2s, v1.2s
+; NEON-AES-NEXT:    eor v5.16b, v5.16b, v19.16b
+; NEON-AES-NEXT:    eor v7.16b, v16.16b, v20.16b
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    eor v2.16b, v5.16b, v3.16b
+; NEON-AES-NEXT:    eor v3.16b, v7.16b, v6.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v2.16b
+; NEON-AES-NEXT:    eor v0.16b, v3.16b, v0.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v0.16b
+; NEON-AES-NEXT:    ret
+  %zextx = zext <2 x i32> %x to <2 x i64> ; Test input: widen %x lane-wise from i32 to i64, so the upper 32 bits of every lane are zero.
+  %zexty = zext <2 x i32> %y to <2 x i64> ; Widen %y the same way.
+  %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %zextx, <2 x i64> %zexty) ; Carry-less multiply (llvm.clmul) of the two zero-extended vectors; checked under both the NEON and NEON-AES prefixes. NOTE(review): the NEON-AES body appears to match the NEON one, i.e. +aes does not seem to change this lowering yet - confirm.
+  ret <2 x i64> %a
+}
+
+define <4 x i64> @clmul_v4i64_neon_zext(<4 x i32> %x, <4 x i32> %y) {
+; NEON-LABEL: clmul_v4i64_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    sub sp, sp, #320
+; NEON-NEXT:    stp d15, d14, [sp, #240] // 16-byte Folded Spill
+; NEON-NEXT:    stp d13, d12, [sp, #256] // 16-byte Folded Spill
+; NEON-NEXT:    stp d11, d10, [sp, #272] // 16-byte Folded Spill
+; NEON-NEXT:    stp d9, d8, [sp, #288] // 16-byte Folded Spill
+; NEON-NEXT:    str x29, [sp, #304] // 8-byte Spill
+; NEON-NEXT:    .cfi_def_cfa_offset 320
+; NEON-NEXT:    .cfi_offset w29, -16
+; NEON-NEXT:    .cfi_offset b8, -24
+; NEON-NEXT:    .cfi_offset b9, -32
+; NEON-NEXT:    .cfi_offset b10, -40
+; NEON-NEXT:    .cfi_offset b11, -48
+; NEON-NEXT:    .cfi_offset b12, -56
+; NEON-NEXT:    .cfi_offset b13, -64
+; NEON-NEXT:    .cfi_offset b14, -72
+; NEON-NEXT:    .cfi_offset b15, -80
+; NEON-NEXT:    mov w8, #2 // =0x2
+; NEON-NEXT:    mov w9, #1 // =0x1
+; NEON-NEXT:    ushll v6.2d, v1.2s, #0
+; NEON-NEXT:    dup v23.2d, x8
+; NEON-NEXT:    dup v22.2d, x9
+; NEON-NEXT:    mov w8, #4 // =0x4
+; NEON-NEXT:    mov w10, #16 // =0x10
+; NEON-NEXT:    dup v28.2d, x8
+; NEON-NEXT:    mov w9, #8 // =0x8
+; NEON-NEXT:    dup v24.2d, x10
+; NEON-NEXT:    ushll2 v5.2d, v1.4s, #0
+; NEON-NEXT:    dup v25.2d, x9
+; NEON-NEXT:    and v3.16b, v6.16b, v23.16b
+; NEON-NEXT:    and v4.16b, v6.16b, v22.16b
+; NEON-NEXT:    mov w8, #32 // =0x20
+; NEON-NEXT:    and v7.16b, v6.16b, v28.16b
+; NEON-NEXT:    dup v26.2d, x8
+; NEON-NEXT:    mov w8, #64 // =0x40
+; NEON-NEXT:    and v17.16b, v6.16b, v24.16b
+; NEON-NEXT:    and v16.16b, v6.16b, v25.16b
+; NEON-NEXT:    and v25.16b, v5.16b, v25.16b
+; NEON-NEXT:    xtn v3.2s, v3.2d
+; NEON-NEXT:    xtn v4.2s, v4.2d
+; NEON-NEXT:    mov w9, #256 // =0x100
+; NEON-NEXT:    xtn v19.2s, v7.2d
+; NEON-NEXT:    dup v18.2d, x8
+; NEON-NEXT:    mov w8, #128 // =0x80
+; NEON-NEXT:    xtn v29.2s, v17.2d
+; NEON-NEXT:    and v27.16b, v6.16b, v26.16b
+; NEON-NEXT:    ldr x29, [sp, #304] // 8-byte Reload
+; NEON-NEXT:    and v24.16b, v5.16b, v24.16b
+; NEON-NEXT:    and v26.16b, v5.16b, v26.16b
+; NEON-NEXT:    uzp1 v25.4s, v0.4s, v25.4s
+; NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-NEXT:    dup v1.2d, x8
+; NEON-NEXT:    mov w8, #512 // =0x200
+; NEON-NEXT:    dup v8.2d, x9
+; NEON-NEXT:    mov w9, #2048 // =0x800
+; NEON-NEXT:    umull v20.2d, v0.2s, v19.2s
+; NEON-NEXT:    umull v19.2d, v0.2s, v29.2s
+; NEON-NEXT:    uzp1 v29.4s, v0.4s, v24.4s
+; NEON-NEXT:    uzp1 v26.4s, v0.4s, v26.4s
+; NEON-NEXT:    and v30.16b, v5.16b, v18.16b
+; NEON-NEXT:    dup v2.2d, x8
+; NEON-NEXT:    mov w8, #1024 // =0x400
+; NEON-NEXT:    stp q8, q1, [sp, #192] // 32-byte Folded Spill
+; NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v5.16b, v22.16b
+; NEON-NEXT:    and v22.16b, v5.16b, v28.16b
+; NEON-NEXT:    dup v7.2d, x8
+; NEON-NEXT:    mov w8, #4096 // =0x1000
+; NEON-NEXT:    and v31.16b, v5.16b, v1.16b
+; NEON-NEXT:    and v8.16b, v5.16b, v8.16b
+; NEON-NEXT:    str q3, [sp, #224] // 16-byte Spill
+; NEON-NEXT:    and v3.16b, v5.16b, v23.16b
+; NEON-NEXT:    umull2 v9.2d, v0.4s, v25.4s
+; NEON-NEXT:    str q7, [sp, #96] // 16-byte Spill
+; NEON-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
+; NEON-NEXT:    uzp1 v28.4s, v0.4s, v22.4s
+; NEON-NEXT:    xtn v21.2s, v16.2d
+; NEON-NEXT:    dup v16.2d, x9
+; NEON-NEXT:    mov w9, #32768 // =0x8000
+; NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    mov w8, #8192 // =0x2000
+; NEON-NEXT:    uzp1 v30.4s, v0.4s, v30.4s
+; NEON-NEXT:    umull2 v29.2d, v0.4s, v29.4s
+; NEON-NEXT:    umull2 v26.2d, v0.4s, v26.4s
+; NEON-NEXT:    stp q16, q2, [sp, #160] // 32-byte Folded Spill
+; NEON-NEXT:    umull2 v4.2d, v0.4s, v4.4s
+; NEON-NEXT:    umull2 v28.2d, v0.4s, v28.4s
+; NEON-NEXT:    uzp1 v31.4s, v0.4s, v31.4s
+; NEON-NEXT:    uzp1 v8.4s, v0.4s, v8.4s
+; NEON-NEXT:    dup v23.2d, x8
+; NEON-NEXT:    mov w8, #16384 // =0x4000
+; NEON-NEXT:    umull2 v3.2d, v0.4s, v3.4s
+; NEON-NEXT:    and v10.16b, v5.16b, v17.16b
+; NEON-NEXT:    umull2 v30.2d, v0.4s, v30.4s
+; NEON-NEXT:    eor v29.16b, v29.16b, v26.16b
+; NEON-NEXT:    dup v24.2d, x8
+; NEON-NEXT:    mov w8, #65536 // =0x10000
+; NEON-NEXT:    stp q17, q23, [sp, #128] // 32-byte Folded Spill
+; NEON-NEXT:    eor v9.16b, v28.16b, v9.16b
+; NEON-NEXT:    and v28.16b, v5.16b, v16.16b
+; NEON-NEXT:    umull2 v31.2d, v0.4s, v31.4s
+; NEON-NEXT:    umull2 v8.2d, v0.4s, v8.4s
+; NEON-NEXT:    str q24, [sp, #112] // 16-byte Spill
+; NEON-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-NEXT:    and v4.16b, v5.16b, v2.16b
+; NEON-NEXT:    uzp1 v10.4s, v0.4s, v10.4s
+; NEON-NEXT:    and v13.16b, v5.16b, v23.16b
+; NEON-NEXT:    dup v1.2d, x8
+; NEON-NEXT:    mov w8, #131072 // =0x20000
+; NEON-NEXT:    uzp1 v12.4s, v0.4s, v28.4s
+; NEON-NEXT:    and v11.16b, v5.16b, v7.16b
+; NEON-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
+; NEON-NEXT:    dup v17.2d, x8
+; NEON-NEXT:    mov w8, #524288 // =0x80000
+; NEON-NEXT:    eor v3.16b, v3.16b, v9.16b
+; NEON-NEXT:    eor v29.16b, v29.16b, v30.16b
+; NEON-NEXT:    uzp1 v9.4s, v0.4s, v13.4s
+; NEON-NEXT:    dup v22.2d, x9
+; NEON-NEXT:    mov w9, #262144 // =0x40000
+; NEON-NEXT:    eor v31.16b, v31.16b, v8.16b
+; NEON-NEXT:    stp q17, q1, [sp, #64] // 32-byte Folded Spill
+; NEON-NEXT:    umull2 v8.2d, v0.4s, v12.4s
+; NEON-NEXT:    umull2 v10.2d, v0.4s, v10.4s
+; NEON-NEXT:    umull2 v4.2d, v0.4s, v4.4s
+; NEON-NEXT:    dup v2.2d, x9
+; NEON-NEXT:    mov w9, #536870912 // =0x20000000
+; NEON-NEXT:    str q22, [sp, #16] // 16-byte Spill
+; NEON-NEXT:    uzp1 v30.4s, v0.4s, v11.4s
+; NEON-NEXT:    eor v7.16b, v3.16b, v29.16b
+; NEON-NEXT:    and v29.16b, v5.16b, v24.16b
+; NEON-NEXT:    and v11.16b, v5.16b, v1.16b
+; NEON-NEXT:    and v12.16b, v5.16b, v17.16b
+; NEON-NEXT:    dup v16.2d, x8
+; NEON-NEXT:    mov w8, #4194304 // =0x400000
+; NEON-NEXT:    eor v8.16b, v8.16b, v10.16b
+; NEON-NEXT:    eor v4.16b, v31.16b, v4.16b
+; NEON-NEXT:    umull2 v31.2d, v0.4s, v9.4s
+; NEON-NEXT:    and v9.16b, v5.16b, v22.16b
+; NEON-NEXT:    uzp1 v10.4s, v0.4s, v29.4s
+; NEON-NEXT:    uzp1 v11.4s, v0.4s, v11.4s
+; NEON-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
+; NEON-NEXT:    stp q16, q2, [sp, #32] // 32-byte Folded Spill
+; NEON-NEXT:    and v13.16b, v5.16b, v2.16b
+; NEON-NEXT:    xtn v14.2s, v27.2d
+; NEON-NEXT:    umull2 v30.2d, v0.4s, v30.4s
+; NEON-NEXT:    dup v1.2d, x8
+; NEON-NEXT:    mov w8, #8388608 // =0x800000
+; NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-NEXT:    uzp1 v9.4s, v0.4s, v9.4s
+; NEON-NEXT:    eor v8.16b, v8.16b, v31.16b
+; NEON-NEXT:    and v31.16b, v5.16b, v16.16b
+; NEON-NEXT:    uzp1 v13.4s, v0.4s, v13.4s
+; NEON-NEXT:    dup v29.2d, x8
+; NEON-NEXT:    mov w8, #16777216 // =0x1000000
+; NEON-NEXT:    str q1, [sp] // 16-byte Spill
+; NEON-NEXT:    umull2 v10.2d, v0.4s, v10.4s
+; NEON-NEXT:    umull2 v11.2d, v0.4s, v11.4s
+; NEON-NEXT:    umull2 v12.2d, v0.4s, v12.4s
+; NEON-NEXT:    eor v2.16b, v4.16b, v30.16b
+; NEON-NEXT:    umull v15.2d, v0.2s, v14.2s
+; NEON-NEXT:    uzp1 v4.4s, v0.4s, v31.4s
+; NEON-NEXT:    dup v31.2d, x8
+; NEON-NEXT:    mov w8, #1048576 // =0x100000
+; NEON-NEXT:    and v3.16b, v5.16b, v1.16b
+; NEON-NEXT:    and v1.16b, v5.16b, v29.16b
+; NEON-NEXT:    umull2 v9.2d, v0.4s, v9.4s
+; NEON-NEXT:    eor v14.16b, v20.16b, v21.16b
+; NEON-NEXT:    umull2 v21.2d, v0.4s, v13.4s
+; NEON-NEXT:    eor v20.16b, v8.16b, v10.16b
+; NEON-NEXT:    eor v2.16b, v7.16b, v2.16b
+; NEON-NEXT:    eor v7.16b, v11.16b, v12.16b
+; NEON-NEXT:    eor v15.16b, v19.16b, v15.16b
+; NEON-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; NEON-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; NEON-NEXT:    and v19.16b, v5.16b, v31.16b
+; NEON-NEXT:    umull2 v8.2d, v0.4s, v4.4s
+; NEON-NEXT:    dup v10.2d, x8
+; NEON-NEXT:    mov w8, #33554432 // =0x2000000
+; NEON-NEXT:    eor v20.16b, v20.16b, v9.16b
+; NEON-NEXT:    eor v7.16b, v7.16b, v21.16b
+; NEON-NEXT:    dup v9.2d, x8
+; NEON-NEXT:    mov w8, #67108864 // =0x4000000
+; NEON-NEXT:    dup v28.2d, x9
+; NEON-NEXT:    uzp1 v19.4s, v0.4s, v19.4s
+; NEON-NEXT:    umull2 v11.2d, v0.4s, v3.4s
+; NEON-NEXT:    umull2 v1.2d, v0.4s, v1.4s
+; NEON-NEXT:    eor v25.16b, v2.16b, v20.16b
+; NEON-NEXT:    movi v2.4s, #128, lsl #24
+; NEON-NEXT:    eor v23.16b, v7.16b, v8.16b
+; NEON-NEXT:    dup v8.2d, x8
+; NEON-NEXT:    mov w8, #1073741824 // =0x40000000
+; NEON-NEXT:    and v12.16b, v5.16b, v9.16b
+; NEON-NEXT:    dup v27.2d, x8
+; NEON-NEXT:    mov w8, #2097152 // =0x200000
+; NEON-NEXT:    and v3.16b, v5.16b, v28.16b
+; NEON-NEXT:    umull2 v7.2d, v0.4s, v19.4s
+; NEON-NEXT:    eor v4.16b, v11.16b, v1.16b
+; NEON-NEXT:    and v11.16b, v5.16b, v10.16b
+; NEON-NEXT:    fneg v30.2d, v2.2d
+; NEON-NEXT:    dup v13.2d, x8
+; NEON-NEXT:    mov w8, #134217728 // =0x8000000
+; NEON-NEXT:    and v1.16b, v5.16b, v27.16b
+; NEON-NEXT:    and v2.16b, v5.16b, v8.16b
+; NEON-NEXT:    dup v24.2d, x8
+; NEON-NEXT:    mov w8, #268435456 // =0x10000000
+; NEON-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
+; NEON-NEXT:    uzp1 v17.4s, v0.4s, v3.4s
+; NEON-NEXT:    eor v19.16b, v4.16b, v7.16b
+; NEON-NEXT:    uzp1 v7.4s, v0.4s, v11.4s
+; NEON-NEXT:    and v3.16b, v5.16b, v13.16b
+; NEON-NEXT:    uzp1 v16.4s, v0.4s, v1.4s
+; NEON-NEXT:    uzp1 v11.4s, v0.4s, v2.4s
+; NEON-NEXT:    and v1.16b, v5.16b, v30.16b
+; NEON-NEXT:    and v2.16b, v5.16b, v24.16b
+; NEON-NEXT:    movi v22.2d, #0000000000000000
+; NEON-NEXT:    and v4.16b, v6.16b, v18.16b
+; NEON-NEXT:    umull2 v12.2d, v0.4s, v12.4s
+; NEON-NEXT:    umull2 v18.2d, v0.4s, v17.4s
+; NEON-NEXT:    dup v26.2d, x8
+; NEON-NEXT:    umull2 v20.2d, v0.4s, v7.4s
+; NEON-NEXT:    uzp1 v7.4s, v0.4s, v3.4s
+; NEON-NEXT:    uzp1 v3.4s, v0.4s, v1.4s
+; NEON-NEXT:    umull2 v17.2d, v0.4s, v16.4s
+; NEON-NEXT:    uzp1 v16.4s, v0.4s, v2.4s
+; NEON-NEXT:    umull2 v11.2d, v0.4s, v11.4s
+; NEON-NEXT:    ldp q1, q2, [sp, #192] // 32-byte Folded Reload
+; NEON-NEXT:    xtn v21.2s, v4.2d
+; NEON-NEXT:    eor v19.16b, v19.16b, v12.16b
+; NEON-NEXT:    umull2 v7.2d, v0.4s, v7.4s
+; NEON-NEXT:    umull2 v4.2d, v0.4s, v3.4s
+; NEON-NEXT:    and v3.16b, v5.16b, v26.16b
+; NEON-NEXT:    and v12.16b, v6.16b, v2.16b
+; NEON-NEXT:    uzp1 v2.4s, v0.4s, v22.4s
+; NEON-NEXT:    ldr q22, [sp, #176] // 16-byte Reload
+; NEON-NEXT:    and v1.16b, v6.16b, v1.16b
+; NEON-NEXT:    eor v20.16b, v23.16b, v20.16b
+; NEON-NEXT:    eor v17.16b, v18.16b, v17.16b
+; NEON-NEXT:    umull2 v16.2d, v0.4s, v16.4s
+; NEON-NEXT:    eor v19.16b, v19.16b, v11.16b
+; NEON-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-NEXT:    xtn v18.2s, v12.2d
+; NEON-NEXT:    uzp1 v11.4s, v0.4s, v3.4s
+; NEON-NEXT:    and v23.16b, v6.16b, v28.16b
+; NEON-NEXT:    xtn v1.2s, v1.2d
+; NEON-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; NEON-NEXT:    eor v2.16b, v20.16b, v7.16b
+; NEON-NEXT:    ldp q12, q20, [sp, #144] // 32-byte Folded Reload
+; NEON-NEXT:    eor v4.16b, v17.16b, v4.16b
+; NEON-NEXT:    ldr q17, [sp, #96] // 16-byte Reload
+; NEON-NEXT:    umull v7.2d, v0.2s, v21.2s
+; NEON-NEXT:    ldr q21, [sp, #128] // 16-byte Reload
+; NEON-NEXT:    eor v16.16b, v19.16b, v16.16b
+; NEON-NEXT:    xtn v19.2s, v22.2d
+; NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-NEXT:    and v17.16b, v6.16b, v17.16b
+; NEON-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-NEXT:    and v21.16b, v6.16b, v21.16b
+; NEON-NEXT:    umull v1.2d, v0.2s, v1.2s
+; NEON-NEXT:    umull2 v22.2d, v0.4s, v11.4s
+; NEON-NEXT:    ldr q11, [sp, #224] // 16-byte Reload
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    and v12.16b, v6.16b, v12.16b
+; NEON-NEXT:    eor v2.16b, v25.16b, v2.16b
+; NEON-NEXT:    xtn v17.2s, v17.2d
+; NEON-NEXT:    xtn v20.2s, v20.2d
+; NEON-NEXT:    xtn v21.2s, v21.2d
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    eor v11.16b, v11.16b, v14.16b
+; NEON-NEXT:    eor v7.16b, v15.16b, v7.16b
+; NEON-NEXT:    ldp d15, d14, [sp, #240] // 16-byte Folded Reload
+; NEON-NEXT:    eor v1.16b, v18.16b, v1.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    eor v16.16b, v16.16b, v22.16b
+; NEON-NEXT:    ldr q22, [sp, #112] // 16-byte Reload
+; NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-NEXT:    umull v18.2d, v0.2s, v20.2s
+; NEON-NEXT:    umull v20.2d, v0.2s, v21.2s
+; NEON-NEXT:    xtn v21.2s, v12.2d
+; NEON-NEXT:    eor v1.16b, v1.16b, v19.16b
+; NEON-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-NEXT:    ldp q25, q19, [sp, #64] // 32-byte Folded Reload
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v16.16b
+; NEON-NEXT:    ldr q16, [sp, #16] // 16-byte Reload
+; NEON-NEXT:    eor v7.16b, v11.16b, v7.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v17.16b
+; NEON-NEXT:    eor v17.16b, v18.16b, v20.16b
+; NEON-NEXT:    ldr q20, [sp, #48] // 16-byte Reload
+; NEON-NEXT:    umull v18.2d, v0.2s, v21.2s
+; NEON-NEXT:    and v16.16b, v6.16b, v16.16b
+; NEON-NEXT:    and v19.16b, v6.16b, v19.16b
+; NEON-NEXT:    xtn v22.2s, v22.2d
+; NEON-NEXT:    and v25.16b, v6.16b, v25.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v1.16b, v7.16b, v1.16b
+; NEON-NEXT:    xtn v16.2s, v16.2d
+; NEON-NEXT:    xtn v19.2s, v19.2d
+; NEON-NEXT:    xtn v21.2s, v25.2d
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
+; NEON-NEXT:    xtn v18.2s, v20.2d
+; NEON-NEXT:    ldr q20, [sp, #32] // 16-byte Reload
+; NEON-NEXT:    umull v7.2d, v0.2s, v22.2s
+; NEON-NEXT:    ldr q22, [sp] // 16-byte Reload
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    and v25.16b, v6.16b, v10.16b
+; NEON-NEXT:    ldp d11, d10, [sp, #272] // 16-byte Folded Reload
+; NEON-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-NEXT:    eor v7.16b, v17.16b, v7.16b
+; NEON-NEXT:    xtn v17.2s, v20.2d
+; NEON-NEXT:    and v20.16b, v6.16b, v29.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-NEXT:    and v29.16b, v6.16b, v31.16b
+; NEON-NEXT:    xtn v22.2s, v22.2d
+; NEON-NEXT:    eor v7.16b, v7.16b, v16.16b
+; NEON-NEXT:    eor v16.16b, v19.16b, v21.16b
+; NEON-NEXT:    xtn v19.2s, v20.2d
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-NEXT:    xtn v20.2s, v25.2d
+; NEON-NEXT:    xtn v21.2s, v29.2d
+; NEON-NEXT:    and v25.16b, v6.16b, v9.16b
+; NEON-NEXT:    eor v16.16b, v16.16b, v18.16b
+; NEON-NEXT:    umull v18.2d, v0.2s, v22.2s
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    eor v22.16b, v4.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v1.16b, v7.16b
+; NEON-NEXT:    xtn v7.2s, v25.2d
+; NEON-NEXT:    and v1.16b, v6.16b, v13.16b
+; NEON-NEXT:    and v25.16b, v6.16b, v27.16b
+; NEON-NEXT:    ldp d13, d12, [sp, #256] // 16-byte Folded Reload
+; NEON-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-NEXT:    umull v17.2d, v0.2s, v20.2s
+; NEON-NEXT:    umull v20.2d, v0.2s, v21.2s
+; NEON-NEXT:    and v21.16b, v6.16b, v8.16b
+; NEON-NEXT:    ldp d9, d8, [sp, #288] // 16-byte Folded Reload
+; NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v18.16b, v18.16b, v19.16b
+; NEON-NEXT:    and v19.16b, v6.16b, v24.16b
+; NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-NEXT:    xtn v21.2s, v21.2d
+; NEON-NEXT:    xtn v1.2s, v1.2d
+; NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-NEXT:    and v24.16b, v6.16b, v26.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v17.16b, v18.16b, v20.16b
+; NEON-NEXT:    xtn v20.2s, v25.2d
+; NEON-NEXT:    movi v25.2d, #0000000000000000
+; NEON-NEXT:    xtn v18.2s, v19.2d
+; NEON-NEXT:    xtn v19.2s, v23.2d
+; NEON-NEXT:    fmov v23.2d, #2.00000000
+; NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-NEXT:    umull v1.2d, v0.2s, v1.2s
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v7.16b, v17.16b, v7.16b
+; NEON-NEXT:    umull v17.2d, v0.2s, v21.2s
+; NEON-NEXT:    and v21.16b, v6.16b, v30.16b
+; NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
+; NEON-NEXT:    fneg v25.2d, v25.2d
+; NEON-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-NEXT:    and v26.16b, v6.16b, v23.16b
+; NEON-NEXT:    xtn v21.2s, v21.2d
+; NEON-NEXT:    eor v7.16b, v7.16b, v17.16b
+; NEON-NEXT:    eor v1.16b, v16.16b, v1.16b
+; NEON-NEXT:    xtn v16.2s, v24.2d
+; NEON-NEXT:    eor v17.16b, v22.16b, v3.16b
+; NEON-NEXT:    and v22.16b, v5.16b, v23.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    and v6.16b, v6.16b, v25.16b
+; NEON-NEXT:    and v5.16b, v5.16b, v25.16b
+; NEON-NEXT:    xtn v23.2s, v26.2d
+; NEON-NEXT:    eor v7.16b, v7.16b, v18.16b
+; NEON-NEXT:    eor v18.16b, v19.16b, v20.16b
+; NEON-NEXT:    umull v19.2d, v0.2s, v21.2s
+; NEON-NEXT:    eor v17.16b, v17.16b, v3.16b
+; NEON-NEXT:    uzp1 v20.4s, v0.4s, v22.4s
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-NEXT:    xtn v6.2s, v6.2d
+; NEON-NEXT:    uzp1 v5.4s, v0.4s, v5.4s
+; NEON-NEXT:    eor v1.16b, v4.16b, v1.16b
+; NEON-NEXT:    umull v21.2d, v0.2s, v23.2s
+; NEON-NEXT:    eor v17.16b, v17.16b, v3.16b
+; NEON-NEXT:    eor v18.16b, v18.16b, v19.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v4.16b, v7.16b, v16.16b
+; NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-NEXT:    umull2 v7.2d, v0.4s, v20.4s
+; NEON-NEXT:    umull2 v0.2d, v0.4s, v5.4s
+; NEON-NEXT:    eor v16.16b, v17.16b, v3.16b
+; NEON-NEXT:    eor v5.16b, v18.16b, v21.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-NEXT:    eor v1.16b, v1.16b, v4.16b
+; NEON-NEXT:    eor v4.16b, v16.16b, v7.16b
+; NEON-NEXT:    eor v3.16b, v5.16b, v6.16b
+; NEON-NEXT:    eor v2.16b, v2.16b, v0.16b
+; NEON-NEXT:    eor v0.16b, v1.16b, v3.16b
+; NEON-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-NEXT:    add sp, sp, #320
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v4i64_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    sub sp, sp, #320
+; NEON-AES-NEXT:    stp d15, d14, [sp, #240] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp d13, d12, [sp, #256] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp d11, d10, [sp, #272] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp d9, d8, [sp, #288] // 16-byte Folded Spill
+; NEON-AES-NEXT:    str x29, [sp, #304] // 8-byte Spill
+; NEON-AES-NEXT:    .cfi_def_cfa_offset 320
+; NEON-AES-NEXT:    .cfi_offset w29, -16
+; NEON-AES-NEXT:    .cfi_offset b8, -24
+; NEON-AES-NEXT:    .cfi_offset b9, -32
+; NEON-AES-NEXT:    .cfi_offset b10, -40
+; NEON-AES-NEXT:    .cfi_offset b11, -48
+; NEON-AES-NEXT:    .cfi_offset b12, -56
+; NEON-AES-NEXT:    .cfi_offset b13, -64
+; NEON-AES-NEXT:    .cfi_offset b14, -72
+; NEON-AES-NEXT:    .cfi_offset b15, -80
+; NEON-AES-NEXT:    mov w8, #2 // =0x2
+; NEON-AES-NEXT:    mov w9, #1 // =0x1
+; NEON-AES-NEXT:    ushll v6.2d, v1.2s, #0
+; NEON-AES-NEXT:    dup v23.2d, x8
+; NEON-AES-NEXT:    dup v22.2d, x9
+; NEON-AES-NEXT:    mov w8, #4 // =0x4
+; NEON-AES-NEXT:    mov w10, #16 // =0x10
+; NEON-AES-NEXT:    dup v28.2d, x8
+; NEON-AES-NEXT:    mov w9, #8 // =0x8
+; NEON-AES-NEXT:    dup v24.2d, x10
+; NEON-AES-NEXT:    ushll2 v5.2d, v1.4s, #0
+; NEON-AES-NEXT:    dup v25.2d, x9
+; NEON-AES-NEXT:    and v3.16b, v6.16b, v23.16b
+; NEON-AES-NEXT:    and v4.16b, v6.16b, v22.16b
+; NEON-AES-NEXT:    mov w8, #32 // =0x20
+; NEON-AES-NEXT:    and v7.16b, v6.16b, v28.16b
+; NEON-AES-NEXT:    dup v26.2d, x8
+; NEON-AES-NEXT:    mov w8, #64 // =0x40
+; NEON-AES-NEXT:    and v17.16b, v6.16b, v24.16b
+; NEON-AES-NEXT:    and v16.16b, v6.16b, v25.16b
+; NEON-AES-NEXT:    and v25.16b, v5.16b, v25.16b
+; NEON-AES-NEXT:    xtn v3.2s, v3.2d
+; NEON-AES-NEXT:    xtn v4.2s, v4.2d
+; NEON-AES-NEXT:    mov w9, #256 // =0x100
+; NEON-AES-NEXT:    xtn v19.2s, v7.2d
+; NEON-AES-NEXT:    dup v18.2d, x8
+; NEON-AES-NEXT:    mov w8, #128 // =0x80
+; NEON-AES-NEXT:    xtn v29.2s, v17.2d
+; NEON-AES-NEXT:    and v27.16b, v6.16b, v26.16b
+; NEON-AES-NEXT:    ldr x29, [sp, #304] // 8-byte Reload
+; NEON-AES-NEXT:    and v24.16b, v5.16b, v24.16b
+; NEON-AES-NEXT:    and v26.16b, v5.16b, v26.16b
+; NEON-AES-NEXT:    uzp1 v25.4s, v0.4s, v25.4s
+; NEON-AES-NEXT:    umull v3.2d, v0.2s, v3.2s
+; NEON-AES-NEXT:    umull v4.2d, v0.2s, v4.2s
+; NEON-AES-NEXT:    dup v1.2d, x8
+; NEON-AES-NEXT:    mov w8, #512 // =0x200
+; NEON-AES-NEXT:    dup v8.2d, x9
+; NEON-AES-NEXT:    mov w9, #2048 // =0x800
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v29.2s
+; NEON-AES-NEXT:    uzp1 v29.4s, v0.4s, v24.4s
+; NEON-AES-NEXT:    uzp1 v26.4s, v0.4s, v26.4s
+; NEON-AES-NEXT:    and v30.16b, v5.16b, v18.16b
+; NEON-AES-NEXT:    dup v2.2d, x8
+; NEON-AES-NEXT:    mov w8, #1024 // =0x400
+; NEON-AES-NEXT:    stp q8, q1, [sp, #192] // 32-byte Folded Spill
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v5.16b, v22.16b
+; NEON-AES-NEXT:    and v22.16b, v5.16b, v28.16b
+; NEON-AES-NEXT:    dup v7.2d, x8
+; NEON-AES-NEXT:    mov w8, #4096 // =0x1000
+; NEON-AES-NEXT:    and v31.16b, v5.16b, v1.16b
+; NEON-AES-NEXT:    and v8.16b, v5.16b, v8.16b
+; NEON-AES-NEXT:    str q3, [sp, #224] // 16-byte Spill
+; NEON-AES-NEXT:    and v3.16b, v5.16b, v23.16b
+; NEON-AES-NEXT:    umull2 v9.2d, v0.4s, v25.4s
+; NEON-AES-NEXT:    str q7, [sp, #96] // 16-byte Spill
+; NEON-AES-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
+; NEON-AES-NEXT:    uzp1 v28.4s, v0.4s, v22.4s
+; NEON-AES-NEXT:    xtn v21.2s, v16.2d
+; NEON-AES-NEXT:    dup v16.2d, x9
+; NEON-AES-NEXT:    mov w9, #32768 // =0x8000
+; NEON-AES-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    mov w8, #8192 // =0x2000
+; NEON-AES-NEXT:    uzp1 v30.4s, v0.4s, v30.4s
+; NEON-AES-NEXT:    umull2 v29.2d, v0.4s, v29.4s
+; NEON-AES-NEXT:    umull2 v26.2d, v0.4s, v26.4s
+; NEON-AES-NEXT:    stp q16, q2, [sp, #160] // 32-byte Folded Spill
+; NEON-AES-NEXT:    umull2 v4.2d, v0.4s, v4.4s
+; NEON-AES-NEXT:    umull2 v28.2d, v0.4s, v28.4s
+; NEON-AES-NEXT:    uzp1 v31.4s, v0.4s, v31.4s
+; NEON-AES-NEXT:    uzp1 v8.4s, v0.4s, v8.4s
+; NEON-AES-NEXT:    dup v23.2d, x8
+; NEON-AES-NEXT:    mov w8, #16384 // =0x4000
+; NEON-AES-NEXT:    umull2 v3.2d, v0.4s, v3.4s
+; NEON-AES-NEXT:    and v10.16b, v5.16b, v17.16b
+; NEON-AES-NEXT:    umull2 v30.2d, v0.4s, v30.4s
+; NEON-AES-NEXT:    eor v29.16b, v29.16b, v26.16b
+; NEON-AES-NEXT:    dup v24.2d, x8
+; NEON-AES-NEXT:    mov w8, #65536 // =0x10000
+; NEON-AES-NEXT:    stp q17, q23, [sp, #128] // 32-byte Folded Spill
+; NEON-AES-NEXT:    eor v9.16b, v28.16b, v9.16b
+; NEON-AES-NEXT:    and v28.16b, v5.16b, v16.16b
+; NEON-AES-NEXT:    umull2 v31.2d, v0.4s, v31.4s
+; NEON-AES-NEXT:    umull2 v8.2d, v0.4s, v8.4s
+; NEON-AES-NEXT:    str q24, [sp, #112] // 16-byte Spill
+; NEON-AES-NEXT:    eor v3.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    and v4.16b, v5.16b, v2.16b
+; NEON-AES-NEXT:    uzp1 v10.4s, v0.4s, v10.4s
+; NEON-AES-NEXT:    and v13.16b, v5.16b, v23.16b
+; NEON-AES-NEXT:    dup v1.2d, x8
+; NEON-AES-NEXT:    mov w8, #131072 // =0x20000
+; NEON-AES-NEXT:    uzp1 v12.4s, v0.4s, v28.4s
+; NEON-AES-NEXT:    and v11.16b, v5.16b, v7.16b
+; NEON-AES-NEXT:    uzp1 v4.4s, v0.4s, v4.4s
+; NEON-AES-NEXT:    dup v17.2d, x8
+; NEON-AES-NEXT:    mov w8, #524288 // =0x80000
+; NEON-AES-NEXT:    eor v3.16b, v3.16b, v9.16b
+; NEON-AES-NEXT:    eor v29.16b, v29.16b, v30.16b
+; NEON-AES-NEXT:    uzp1 v9.4s, v0.4s, v13.4s
+; NEON-AES-NEXT:    dup v22.2d, x9
+; NEON-AES-NEXT:    mov w9, #262144 // =0x40000
+; NEON-AES-NEXT:    eor v31.16b, v31.16b, v8.16b
+; NEON-AES-NEXT:    stp q17, q1, [sp, #64] // 32-byte Folded Spill
+; NEON-AES-NEXT:    umull2 v8.2d, v0.4s, v12.4s
+; NEON-AES-NEXT:    umull2 v10.2d, v0.4s, v10.4s
+; NEON-AES-NEXT:    umull2 v4.2d, v0.4s, v4.4s
+; NEON-AES-NEXT:    dup v2.2d, x9
+; NEON-AES-NEXT:    mov w9, #536870912 // =0x20000000
+; NEON-AES-NEXT:    str q22, [sp, #16] // 16-byte Spill
+; NEON-AES-NEXT:    uzp1 v30.4s, v0.4s, v11.4s
+; NEON-AES-NEXT:    eor v7.16b, v3.16b, v29.16b
+; NEON-AES-NEXT:    and v29.16b, v5.16b, v24.16b
+; NEON-AES-NEXT:    and v11.16b, v5.16b, v1.16b
+; NEON-AES-NEXT:    and v12.16b, v5.16b, v17.16b
+; NEON-AES-NEXT:    dup v16.2d, x8
+; NEON-AES-NEXT:    mov w8, #4194304 // =0x400000
+; NEON-AES-NEXT:    eor v8.16b, v8.16b, v10.16b
+; NEON-AES-NEXT:    eor v4.16b, v31.16b, v4.16b
+; NEON-AES-NEXT:    umull2 v31.2d, v0.4s, v9.4s
+; NEON-AES-NEXT:    and v9.16b, v5.16b, v22.16b
+; NEON-AES-NEXT:    uzp1 v10.4s, v0.4s, v29.4s
+; NEON-AES-NEXT:    uzp1 v11.4s, v0.4s, v11.4s
+; NEON-AES-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
+; NEON-AES-NEXT:    stp q16, q2, [sp, #32] // 32-byte Folded Spill
+; NEON-AES-NEXT:    and v13.16b, v5.16b, v2.16b
+; NEON-AES-NEXT:    xtn v14.2s, v27.2d
+; NEON-AES-NEXT:    umull2 v30.2d, v0.4s, v30.4s
+; NEON-AES-NEXT:    dup v1.2d, x8
+; NEON-AES-NEXT:    mov w8, #8388608 // =0x800000
+; NEON-AES-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    uzp1 v9.4s, v0.4s, v9.4s
+; NEON-AES-NEXT:    eor v8.16b, v8.16b, v31.16b
+; NEON-AES-NEXT:    and v31.16b, v5.16b, v16.16b
+; NEON-AES-NEXT:    uzp1 v13.4s, v0.4s, v13.4s
+; NEON-AES-NEXT:    dup v29.2d, x8
+; NEON-AES-NEXT:    mov w8, #16777216 // =0x1000000
+; NEON-AES-NEXT:    str q1, [sp] // 16-byte Spill
+; NEON-AES-NEXT:    umull2 v10.2d, v0.4s, v10.4s
+; NEON-AES-NEXT:    umull2 v11.2d, v0.4s, v11.4s
+; NEON-AES-NEXT:    umull2 v12.2d, v0.4s, v12.4s
+; NEON-AES-NEXT:    eor v2.16b, v4.16b, v30.16b
+; NEON-AES-NEXT:    umull v15.2d, v0.2s, v14.2s
+; NEON-AES-NEXT:    uzp1 v4.4s, v0.4s, v31.4s
+; NEON-AES-NEXT:    dup v31.2d, x8
+; NEON-AES-NEXT:    mov w8, #1048576 // =0x100000
+; NEON-AES-NEXT:    and v3.16b, v5.16b, v1.16b
+; NEON-AES-NEXT:    and v1.16b, v5.16b, v29.16b
+; NEON-AES-NEXT:    umull2 v9.2d, v0.4s, v9.4s
+; NEON-AES-NEXT:    eor v14.16b, v20.16b, v21.16b
+; NEON-AES-NEXT:    umull2 v21.2d, v0.4s, v13.4s
+; NEON-AES-NEXT:    eor v20.16b, v8.16b, v10.16b
+; NEON-AES-NEXT:    eor v2.16b, v7.16b, v2.16b
+; NEON-AES-NEXT:    eor v7.16b, v11.16b, v12.16b
+; NEON-AES-NEXT:    eor v15.16b, v19.16b, v15.16b
+; NEON-AES-NEXT:    uzp1 v3.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; NEON-AES-NEXT:    and v19.16b, v5.16b, v31.16b
+; NEON-AES-NEXT:    umull2 v8.2d, v0.4s, v4.4s
+; NEON-AES-NEXT:    dup v10.2d, x8
+; NEON-AES-NEXT:    mov w8, #33554432 // =0x2000000
+; NEON-AES-NEXT:    eor v20.16b, v20.16b, v9.16b
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v21.16b
+; NEON-AES-NEXT:    dup v9.2d, x8
+; NEON-AES-NEXT:    mov w8, #67108864 // =0x4000000
+; NEON-AES-NEXT:    dup v28.2d, x9
+; NEON-AES-NEXT:    uzp1 v19.4s, v0.4s, v19.4s
+; NEON-AES-NEXT:    umull2 v11.2d, v0.4s, v3.4s
+; NEON-AES-NEXT:    umull2 v1.2d, v0.4s, v1.4s
+; NEON-AES-NEXT:    eor v25.16b, v2.16b, v20.16b
+; NEON-AES-NEXT:    movi v2.4s, #128, lsl #24
+; NEON-AES-NEXT:    eor v23.16b, v7.16b, v8.16b
+; NEON-AES-NEXT:    dup v8.2d, x8
+; NEON-AES-NEXT:    mov w8, #1073741824 // =0x40000000
+; NEON-AES-NEXT:    and v12.16b, v5.16b, v9.16b
+; NEON-AES-NEXT:    dup v27.2d, x8
+; NEON-AES-NEXT:    mov w8, #2097152 // =0x200000
+; NEON-AES-NEXT:    and v3.16b, v5.16b, v28.16b
+; NEON-AES-NEXT:    umull2 v7.2d, v0.4s, v19.4s
+; NEON-AES-NEXT:    eor v4.16b, v11.16b, v1.16b
+; NEON-AES-NEXT:    and v11.16b, v5.16b, v10.16b
+; NEON-AES-NEXT:    fneg v30.2d, v2.2d
+; NEON-AES-NEXT:    dup v13.2d, x8
+; NEON-AES-NEXT:    mov w8, #134217728 // =0x8000000
+; NEON-AES-NEXT:    and v1.16b, v5.16b, v27.16b
+; NEON-AES-NEXT:    and v2.16b, v5.16b, v8.16b
+; NEON-AES-NEXT:    dup v24.2d, x8
+; NEON-AES-NEXT:    mov w8, #268435456 // =0x10000000
+; NEON-AES-NEXT:    uzp1 v12.4s, v0.4s, v12.4s
+; NEON-AES-NEXT:    uzp1 v17.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    eor v19.16b, v4.16b, v7.16b
+; NEON-AES-NEXT:    uzp1 v7.4s, v0.4s, v11.4s
+; NEON-AES-NEXT:    and v3.16b, v5.16b, v13.16b
+; NEON-AES-NEXT:    uzp1 v16.4s, v0.4s, v1.4s
+; NEON-AES-NEXT:    uzp1 v11.4s, v0.4s, v2.4s
+; NEON-AES-NEXT:    and v1.16b, v5.16b, v30.16b
+; NEON-AES-NEXT:    and v2.16b, v5.16b, v24.16b
+; NEON-AES-NEXT:    movi v22.2d, #0000000000000000
+; NEON-AES-NEXT:    and v4.16b, v6.16b, v18.16b
+; NEON-AES-NEXT:    umull2 v12.2d, v0.4s, v12.4s
+; NEON-AES-NEXT:    umull2 v18.2d, v0.4s, v17.4s
+; NEON-AES-NEXT:    dup v26.2d, x8
+; NEON-AES-NEXT:    umull2 v20.2d, v0.4s, v7.4s
+; NEON-AES-NEXT:    uzp1 v7.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    uzp1 v3.4s, v0.4s, v1.4s
+; NEON-AES-NEXT:    umull2 v17.2d, v0.4s, v16.4s
+; NEON-AES-NEXT:    uzp1 v16.4s, v0.4s, v2.4s
+; NEON-AES-NEXT:    umull2 v11.2d, v0.4s, v11.4s
+; NEON-AES-NEXT:    ldp q1, q2, [sp, #192] // 32-byte Folded Reload
+; NEON-AES-NEXT:    xtn v21.2s, v4.2d
+; NEON-AES-NEXT:    eor v19.16b, v19.16b, v12.16b
+; NEON-AES-NEXT:    umull2 v7.2d, v0.4s, v7.4s
+; NEON-AES-NEXT:    umull2 v4.2d, v0.4s, v3.4s
+; NEON-AES-NEXT:    and v3.16b, v5.16b, v26.16b
+; NEON-AES-NEXT:    and v12.16b, v6.16b, v2.16b
+; NEON-AES-NEXT:    uzp1 v2.4s, v0.4s, v22.4s
+; NEON-AES-NEXT:    ldr q22, [sp, #176] // 16-byte Reload
+; NEON-AES-NEXT:    and v1.16b, v6.16b, v1.16b
+; NEON-AES-NEXT:    eor v20.16b, v23.16b, v20.16b
+; NEON-AES-NEXT:    eor v17.16b, v18.16b, v17.16b
+; NEON-AES-NEXT:    umull2 v16.2d, v0.4s, v16.4s
+; NEON-AES-NEXT:    eor v19.16b, v19.16b, v11.16b
+; NEON-AES-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-AES-NEXT:    xtn v18.2s, v12.2d
+; NEON-AES-NEXT:    uzp1 v11.4s, v0.4s, v3.4s
+; NEON-AES-NEXT:    and v23.16b, v6.16b, v28.16b
+; NEON-AES-NEXT:    xtn v1.2s, v1.2d
+; NEON-AES-NEXT:    umull2 v3.2d, v0.4s, v2.4s
+; NEON-AES-NEXT:    eor v2.16b, v20.16b, v7.16b
+; NEON-AES-NEXT:    ldp q12, q20, [sp, #144] // 32-byte Folded Reload
+; NEON-AES-NEXT:    eor v4.16b, v17.16b, v4.16b
+; NEON-AES-NEXT:    ldr q17, [sp, #96] // 16-byte Reload
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    ldr q21, [sp, #128] // 16-byte Reload
+; NEON-AES-NEXT:    eor v16.16b, v19.16b, v16.16b
+; NEON-AES-NEXT:    xtn v19.2s, v22.2d
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-AES-NEXT:    and v17.16b, v6.16b, v17.16b
+; NEON-AES-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-AES-NEXT:    and v21.16b, v6.16b, v21.16b
+; NEON-AES-NEXT:    umull v1.2d, v0.2s, v1.2s
+; NEON-AES-NEXT:    umull2 v22.2d, v0.4s, v11.4s
+; NEON-AES-NEXT:    ldr q11, [sp, #224] // 16-byte Reload
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    and v12.16b, v6.16b, v12.16b
+; NEON-AES-NEXT:    eor v2.16b, v25.16b, v2.16b
+; NEON-AES-NEXT:    xtn v17.2s, v17.2d
+; NEON-AES-NEXT:    xtn v20.2s, v20.2d
+; NEON-AES-NEXT:    xtn v21.2s, v21.2d
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    eor v11.16b, v11.16b, v14.16b
+; NEON-AES-NEXT:    eor v7.16b, v15.16b, v7.16b
+; NEON-AES-NEXT:    ldp d15, d14, [sp, #240] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor v1.16b, v18.16b, v1.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v22.16b
+; NEON-AES-NEXT:    ldr q22, [sp, #112] // 16-byte Reload
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v20.2s
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    xtn v21.2s, v12.2d
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v19.16b
+; NEON-AES-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-AES-NEXT:    ldp q25, q19, [sp, #64] // 32-byte Folded Reload
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v16.16b
+; NEON-AES-NEXT:    ldr q16, [sp, #16] // 16-byte Reload
+; NEON-AES-NEXT:    eor v7.16b, v11.16b, v7.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v17.16b
+; NEON-AES-NEXT:    eor v17.16b, v18.16b, v20.16b
+; NEON-AES-NEXT:    ldr q20, [sp, #48] // 16-byte Reload
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    and v16.16b, v6.16b, v16.16b
+; NEON-AES-NEXT:    and v19.16b, v6.16b, v19.16b
+; NEON-AES-NEXT:    xtn v22.2s, v22.2d
+; NEON-AES-NEXT:    and v25.16b, v6.16b, v25.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v1.16b, v7.16b, v1.16b
+; NEON-AES-NEXT:    xtn v16.2s, v16.2d
+; NEON-AES-NEXT:    xtn v19.2s, v19.2d
+; NEON-AES-NEXT:    xtn v21.2s, v25.2d
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    eor v17.16b, v17.16b, v18.16b
+; NEON-AES-NEXT:    xtn v18.2s, v20.2d
+; NEON-AES-NEXT:    ldr q20, [sp, #32] // 16-byte Reload
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v22.2s
+; NEON-AES-NEXT:    ldr q22, [sp] // 16-byte Reload
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    and v25.16b, v6.16b, v10.16b
+; NEON-AES-NEXT:    ldp d11, d10, [sp, #272] // 16-byte Folded Reload
+; NEON-AES-NEXT:    and v20.16b, v6.16b, v20.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    umull v21.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    and v22.16b, v6.16b, v22.16b
+; NEON-AES-NEXT:    eor v7.16b, v17.16b, v7.16b
+; NEON-AES-NEXT:    xtn v17.2s, v20.2d
+; NEON-AES-NEXT:    and v20.16b, v6.16b, v29.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-AES-NEXT:    and v29.16b, v6.16b, v31.16b
+; NEON-AES-NEXT:    xtn v22.2s, v22.2d
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v16.16b
+; NEON-AES-NEXT:    eor v16.16b, v19.16b, v21.16b
+; NEON-AES-NEXT:    xtn v19.2s, v20.2d
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v17.2s
+; NEON-AES-NEXT:    xtn v20.2s, v25.2d
+; NEON-AES-NEXT:    xtn v21.2s, v29.2d
+; NEON-AES-NEXT:    and v25.16b, v6.16b, v9.16b
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v18.16b
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v22.2s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    eor v22.16b, v4.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v1.16b, v7.16b
+; NEON-AES-NEXT:    xtn v7.2s, v25.2d
+; NEON-AES-NEXT:    and v1.16b, v6.16b, v13.16b
+; NEON-AES-NEXT:    and v25.16b, v6.16b, v27.16b
+; NEON-AES-NEXT:    ldp d13, d12, [sp, #256] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v20.2s
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    and v21.16b, v6.16b, v8.16b
+; NEON-AES-NEXT:    ldp d9, d8, [sp, #288] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v18.16b, v18.16b, v19.16b
+; NEON-AES-NEXT:    and v19.16b, v6.16b, v24.16b
+; NEON-AES-NEXT:    umull v7.2d, v0.2s, v7.2s
+; NEON-AES-NEXT:    xtn v21.2s, v21.2d
+; NEON-AES-NEXT:    xtn v1.2s, v1.2d
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-AES-NEXT:    eor v16.16b, v16.16b, v17.16b
+; NEON-AES-NEXT:    and v24.16b, v6.16b, v26.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v17.16b, v18.16b, v20.16b
+; NEON-AES-NEXT:    xtn v20.2s, v25.2d
+; NEON-AES-NEXT:    movi v25.2d, #0000000000000000
+; NEON-AES-NEXT:    xtn v18.2s, v19.2d
+; NEON-AES-NEXT:    xtn v19.2s, v23.2d
+; NEON-AES-NEXT:    fmov v23.2d, #2.00000000
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-AES-NEXT:    umull v1.2d, v0.2s, v1.2s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v7.16b, v17.16b, v7.16b
+; NEON-AES-NEXT:    umull v17.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    and v21.16b, v6.16b, v30.16b
+; NEON-AES-NEXT:    umull v20.2d, v0.2s, v20.2s
+; NEON-AES-NEXT:    fneg v25.2d, v25.2d
+; NEON-AES-NEXT:    eor v22.16b, v22.16b, v3.16b
+; NEON-AES-NEXT:    umull v18.2d, v0.2s, v18.2s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v19.2s
+; NEON-AES-NEXT:    and v26.16b, v6.16b, v23.16b
+; NEON-AES-NEXT:    xtn v21.2s, v21.2d
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v17.16b
+; NEON-AES-NEXT:    eor v1.16b, v16.16b, v1.16b
+; NEON-AES-NEXT:    xtn v16.2s, v24.2d
+; NEON-AES-NEXT:    eor v17.16b, v22.16b, v3.16b
+; NEON-AES-NEXT:    and v22.16b, v5.16b, v23.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    and v6.16b, v6.16b, v25.16b
+; NEON-AES-NEXT:    and v5.16b, v5.16b, v25.16b
+; NEON-AES-NEXT:    xtn v23.2s, v26.2d
+; NEON-AES-NEXT:    eor v7.16b, v7.16b, v18.16b
+; NEON-AES-NEXT:    eor v18.16b, v19.16b, v20.16b
+; NEON-AES-NEXT:    umull v19.2d, v0.2s, v21.2s
+; NEON-AES-NEXT:    eor v17.16b, v17.16b, v3.16b
+; NEON-AES-NEXT:    uzp1 v20.4s, v0.4s, v22.4s
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    umull v16.2d, v0.2s, v16.2s
+; NEON-AES-NEXT:    xtn v6.2s, v6.2d
+; NEON-AES-NEXT:    uzp1 v5.4s, v0.4s, v5.4s
+; NEON-AES-NEXT:    eor v1.16b, v4.16b, v1.16b
+; NEON-AES-NEXT:    umull v21.2d, v0.2s, v23.2s
+; NEON-AES-NEXT:    eor v17.16b, v17.16b, v3.16b
+; NEON-AES-NEXT:    eor v18.16b, v18.16b, v19.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v4.16b, v7.16b, v16.16b
+; NEON-AES-NEXT:    umull v6.2d, v0.2s, v6.2s
+; NEON-AES-NEXT:    umull2 v7.2d, v0.4s, v20.4s
+; NEON-AES-NEXT:    umull2 v0.2d, v0.4s, v5.4s
+; NEON-AES-NEXT:    eor v16.16b, v17.16b, v3.16b
+; NEON-AES-NEXT:    eor v5.16b, v18.16b, v21.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v3.16b
+; NEON-AES-NEXT:    eor v1.16b, v1.16b, v4.16b
+; NEON-AES-NEXT:    eor v4.16b, v16.16b, v7.16b
+; NEON-AES-NEXT:    eor v3.16b, v5.16b, v6.16b
+; NEON-AES-NEXT:    eor v2.16b, v2.16b, v0.16b
+; NEON-AES-NEXT:    eor v0.16b, v1.16b, v3.16b
+; NEON-AES-NEXT:    eor v1.16b, v2.16b, v4.16b
+; NEON-AES-NEXT:    add sp, sp, #320
+; NEON-AES-NEXT:    ret
+  %zextx = zext <4 x i32> %x to <4 x i64>
+  %zexty = zext <4 x i32> %y to <4 x i64>
+  %a = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %zextx, <4 x i64> %zexty)
+  ret <4 x i64> %a
+}
+
+define <1 x i128> @clmul_v1i128_neon_zext(<1 x i64> %x, <1 x i64> %y) {
+; NEON-LABEL: clmul_v1i128_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-NEXT:    sub sp, sp, #624
+; NEON-NEXT:    .cfi_def_cfa_offset 720
+; NEON-NEXT:    .cfi_offset w19, -8
+; NEON-NEXT:    .cfi_offset w20, -16
+; NEON-NEXT:    .cfi_offset w21, -24
+; NEON-NEXT:    .cfi_offset w22, -32
+; NEON-NEXT:    .cfi_offset w23, -40
+; NEON-NEXT:    .cfi_offset w24, -48
+; NEON-NEXT:    .cfi_offset w25, -56
+; NEON-NEXT:    .cfi_offset w26, -64
+; NEON-NEXT:    .cfi_offset w27, -72
+; NEON-NEXT:    .cfi_offset w28, -80
+; NEON-NEXT:    .cfi_offset w30, -88
+; NEON-NEXT:    .cfi_offset w29, -96
+; NEON-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NEON-NEXT:    fmov x10, d1
+; NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-NEXT:    fmov x11, d0
+; NEON-NEXT:    and x8, x10, #0x2
+; NEON-NEXT:    mul x13, x11, x8
+; NEON-NEXT:    and x8, x10, #0x1
+; NEON-NEXT:    mul x14, x11, x8
+; NEON-NEXT:    and x8, x10, #0x4
+; NEON-NEXT:    mul x15, x11, x8
+; NEON-NEXT:    and x8, x10, #0x8
+; NEON-NEXT:    mul x16, x11, x8
+; NEON-NEXT:    and x8, x10, #0x10
+; NEON-NEXT:    mul x17, x11, x8
+; NEON-NEXT:    and x8, x10, #0x20
+; NEON-NEXT:    mul x18, x11, x8
+; NEON-NEXT:    and x8, x10, #0x40
+; NEON-NEXT:    mul x0, x11, x8
+; NEON-NEXT:    and x8, x10, #0x80
+; NEON-NEXT:    mul x1, x11, x8
+; NEON-NEXT:    and x8, x10, #0x100
+; NEON-NEXT:    mul x3, x11, x8
+; NEON-NEXT:    and x8, x10, #0x200
+; NEON-NEXT:    mul x2, x11, x8
+; NEON-NEXT:    and x8, x10, #0x400
+; NEON-NEXT:    mul x4, x11, x8
+; NEON-NEXT:    and x8, x10, #0x800
+; NEON-NEXT:    mul x5, x11, x8
+; NEON-NEXT:    and x8, x10, #0x1000
+; NEON-NEXT:    mul x20, x11, x8
+; NEON-NEXT:    and x8, x10, #0x2000
+; NEON-NEXT:    mul x6, x11, x8
+; NEON-NEXT:    and x8, x10, #0x4000
+; NEON-NEXT:    mul x7, x11, x8
+; NEON-NEXT:    and x8, x10, #0x8000
+; NEON-NEXT:    mul x19, x11, x8
+; NEON-NEXT:    and x8, x10, #0x10000
+; NEON-NEXT:    mul x21, x11, x8
+; NEON-NEXT:    and x8, x10, #0x20000
+; NEON-NEXT:    mul x22, x11, x8
+; NEON-NEXT:    and x8, x10, #0x40000
+; NEON-NEXT:    mul x23, x11, x8
+; NEON-NEXT:    and x8, x10, #0x80000
+; NEON-NEXT:    mul x24, x11, x8
+; NEON-NEXT:    and x8, x10, #0x100000
+; NEON-NEXT:    mul x25, x11, x8
+; NEON-NEXT:    and x8, x10, #0x200000
+; NEON-NEXT:    mul x26, x11, x8
+; NEON-NEXT:    and x8, x10, #0x400000
+; NEON-NEXT:    mul x27, x11, x8
+; NEON-NEXT:    and x8, x10, #0x800000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #592] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x1000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #584] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x2000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #616] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x4000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #576] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x8000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #608] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x10000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #600] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x20000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #568] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x40000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #512] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x80000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #536] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x100000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #528] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x200000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #560] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x400000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #520] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x800000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #552] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x1000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #544] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x2000000000
+; NEON-NEXT:    mul x9, x11, x8
+; NEON-NEXT:    and x8, x10, #0x4000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #440] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x8000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #464] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x10000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #456] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x20000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    stp x8, x9, [sp, #488] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x10, #0x40000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #448] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x80000000000
+; NEON-NEXT:    mul x9, x11, x8
+; NEON-NEXT:    and x8, x10, #0x100000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    stp x8, x9, [sp, #472] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x10, #0x200000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #504] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x400000000000
+; NEON-NEXT:    mul x9, x11, x8
+; NEON-NEXT:    and x8, x10, #0x800000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #392] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x1000000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #416] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x2000000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #408] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x4000000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    stp x8, x9, [sp, #424] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x10, #0x8000000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    str x8, [sp, #400] // 8-byte Spill
+; NEON-NEXT:    and x8, x10, #0x100000000000000
+; NEON-NEXT:    mul x9, x11, x8
+; NEON-NEXT:    and x8, x10, #0x200000000000000
+; NEON-NEXT:    mul x8, x11, x8
+; NEON-NEXT:    stp x8, x9, [sp, #376] // 16-byte Folded Spill
+; NEON-NEXT:    and x9, x10, #0x400000000000000
+; NEON-NEXT:    rbit x8, x10
+; NEON-NEXT:    mul x9, x11, x9
+; NEON-NEXT:    and x12, x8, #0x2
+; NEON-NEXT:    str x9, [sp, #368] // 8-byte Spill
+; NEON-NEXT:    rbit x9, x11
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #360] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x1
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #352] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x4
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #344] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x8
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #336] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x10
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #328] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x20
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #320] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x40
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #312] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x80
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #304] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x100
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #296] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x200
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #288] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x400
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #280] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x800
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #272] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x1000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #256] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x2000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #248] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x4000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #264] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x8000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #240] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x10000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #232] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x20000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #200] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x40000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #224] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x80000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #192] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x100000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #216] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x200000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #208] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x400000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #184] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x800000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #136] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x1000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #168] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x2000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #160] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x4000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #176] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x8000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #152] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x10000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #144] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x20000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #128] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x40000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #120] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x80000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #112] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x100000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #104] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x200000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #96] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x400000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #88] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x800000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #80] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x1000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #72] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x2000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #64] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x4000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #56] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x8000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #48] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x10000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #40] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x20000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #32] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x40000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #24] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x80000000000
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    str x12, [sp, #16] // 8-byte Spill
+; NEON-NEXT:    eor x12, x14, x13
+; NEON-NEXT:    and x14, x8, #0x100000000000
+; NEON-NEXT:    mul x14, x9, x14
+; NEON-NEXT:    eor x13, x15, x16
+; NEON-NEXT:    and x15, x8, #0x200000000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    eor x13, x17, x18
+; NEON-NEXT:    ldr x16, [sp, #608] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x0
+; NEON-NEXT:    mul x30, x9, x15
+; NEON-NEXT:    and x15, x8, #0x400000000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    mul x29, x9, x15
+; NEON-NEXT:    and x15, x8, #0x800000000000
+; NEON-NEXT:    str x14, [sp, #8] // 8-byte Spill
+; NEON-NEXT:    eor x14, x1, x3
+; NEON-NEXT:    eor x13, x14, x2
+; NEON-NEXT:    eor x14, x5, x20
+; NEON-NEXT:    mul x28, x9, x15
+; NEON-NEXT:    eor x13, x13, x4
+; NEON-NEXT:    ldr x15, [sp, #592] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    eor x13, x14, x6
+; NEON-NEXT:    eor x14, x21, x22
+; NEON-NEXT:    eor x13, x13, x7
+; NEON-NEXT:    eor x14, x14, x23
+; NEON-NEXT:    eor x15, x27, x15
+; NEON-NEXT:    eor x13, x13, x19
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    eor x13, x14, x24
+; NEON-NEXT:    and x14, x8, #0x1000000000000
+; NEON-NEXT:    eor x13, x13, x25
+; NEON-NEXT:    mul x27, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #584] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x26
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #616] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    and x15, x8, #0x2000000000000
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #576] // 8-byte Reload
+; NEON-NEXT:    mul x25, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #512] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #568] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #536] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x4000000000000
+; NEON-NEXT:    mul x24, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #600] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #552] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #528] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #560] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x8000000000000
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #520] // 8-byte Reload
+; NEON-NEXT:    mul x23, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #440] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #496] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #464] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x10000000000000
+; NEON-NEXT:    mul x21, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #544] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #256] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #456] // 8-byte Reload
+; NEON-NEXT:    eor x26, x12, x13
+; NEON-NEXT:    ldr x12, [sp, #488] // 8-byte Reload
+; NEON-NEXT:    ldr x13, [sp, #448] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x20000000000000
+; NEON-NEXT:    eor x12, x14, x12
+; NEON-NEXT:    mul x20, x9, x15
+; NEON-NEXT:    ldr x14, [sp, #392] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #432] // 8-byte Reload
+; NEON-NEXT:    ldr x15, [sp, #480] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    and x14, x8, #0x40000000000000
+; NEON-NEXT:    eor x12, x12, x15
+; NEON-NEXT:    ldr x15, [sp, #416] // 8-byte Reload
+; NEON-NEXT:    mul x7, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #472] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #504] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x14
+; NEON-NEXT:    ldr x14, [sp, #408] // 8-byte Reload
+; NEON-NEXT:    eor x22, x12, x15
+; NEON-NEXT:    ldr x12, [sp, #424] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    and x14, x8, #0x80000000000000
+; NEON-NEXT:    eor x12, x13, x12
+; NEON-NEXT:    ldr x13, [sp, #400] // 8-byte Reload
+; NEON-NEXT:    mul x5, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #368] // 8-byte Reload
+; NEON-NEXT:    eor x19, x12, x13
+; NEON-NEXT:    ldp x13, x12, [sp, #376] // 16-byte Folded Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    and x13, x8, #0x100000000000000
+; NEON-NEXT:    eor x6, x12, x14
+; NEON-NEXT:    ldp x14, x12, [sp, #352] // 16-byte Folded Reload
+; NEON-NEXT:    mul x4, x9, x13
+; NEON-NEXT:    eor x12, x14, x12
+; NEON-NEXT:    ldp x14, x13, [sp, #336] // 16-byte Folded Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldp x15, x14, [sp, #320] // 16-byte Folded Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #312] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x200000000000000
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    mul x3, x9, x15
+; NEON-NEXT:    and x15, x8, #0x400000000000000
+; NEON-NEXT:    eor x14, x12, x13
+; NEON-NEXT:    ldp x13, x12, [sp, #296] // 16-byte Folded Reload
+; NEON-NEXT:    mul x2, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #280] // 8-byte Reload
+; NEON-NEXT:    eor x13, x12, x13
+; NEON-NEXT:    ldr x12, [sp, #288] // 8-byte Reload
+; NEON-NEXT:    eor x3, x4, x3
+; NEON-NEXT:    and x4, x10, #0x2000000000000000
+; NEON-NEXT:    eor x13, x13, x12
+; NEON-NEXT:    ldr x12, [sp, #272] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #248] // 8-byte Reload
+; NEON-NEXT:    eor x2, x3, x2
+; NEON-NEXT:    eor x12, x12, x16
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #264] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x15
+; NEON-NEXT:    and x15, x8, #0x800000000000000
+; NEON-NEXT:    mul x3, x11, x4
+; NEON-NEXT:    eor x14, x12, x14
+; NEON-NEXT:    mul x1, x9, x15
+; NEON-NEXT:    ldp x12, x15, [sp, #232] // 16-byte Folded Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    ldr x15, [sp, #200] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldp x16, x14, [sp, #216] // 16-byte Folded Reload
+; NEON-NEXT:    eor x15, x12, x15
+; NEON-NEXT:    and x12, x8, #0x1000000000000000
+; NEON-NEXT:    eor x1, x2, x1
+; NEON-NEXT:    mul x0, x9, x12
+; NEON-NEXT:    and x2, x10, #0x4000000000000000
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    ldp x12, x15, [sp, #184] // 16-byte Folded Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    ldr x15, [sp, #136] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #168] // 8-byte Reload
+; NEON-NEXT:    eor x0, x1, x0
+; NEON-NEXT:    eor x15, x12, x15
+; NEON-NEXT:    and x12, x8, #0x2000000000000000
+; NEON-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-NEXT:    mul x18, x9, x12
+; NEON-NEXT:    ldr x12, [sp, #208] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    eor x14, x14, x12
+; NEON-NEXT:    ldr x12, [sp, #160] // 8-byte Reload
+; NEON-NEXT:    mul x17, x9, x8
+; NEON-NEXT:    ldr x8, [sp, #152] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    eor x15, x15, x12
+; NEON-NEXT:    ldr x12, [sp, #176] // 8-byte Reload
+; NEON-NEXT:    mul x1, x11, x2
+; NEON-NEXT:    eor x18, x0, x18
+; NEON-NEXT:    eor x14, x15, x12
+; NEON-NEXT:    and x15, x10, #0x10000000000000
+; NEON-NEXT:    eor x9, x14, x8
+; NEON-NEXT:    ldp x12, x8, [sp, #120] // 16-byte Folded Reload
+; NEON-NEXT:    mul x16, x11, x15
+; NEON-NEXT:    eor x14, x8, x12
+; NEON-NEXT:    ldr x8, [sp, #144] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x8
+; NEON-NEXT:    ldr x8, [sp, #112] // 8-byte Reload
+; NEON-NEXT:    eor x9, x13, x9
+; NEON-NEXT:    eor x14, x14, x8
+; NEON-NEXT:    ldr x8, [sp, #104] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x8
+; NEON-NEXT:    ldr x8, [sp, #96] // 8-byte Reload
+; NEON-NEXT:    and x14, x10, #0x20000000000000
+; NEON-NEXT:    mul x15, x11, x14
+; NEON-NEXT:    eor x13, x13, x8
+; NEON-NEXT:    ldp x12, x8, [sp, #56] // 16-byte Folded Reload
+; NEON-NEXT:    eor x8, x8, x12
+; NEON-NEXT:    ldp x14, x12, [sp, #80] // 16-byte Folded Reload
+; NEON-NEXT:    eor x13, x13, x12
+; NEON-NEXT:    ldr x12, [sp, #48] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #40] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x12
+; NEON-NEXT:    and x12, x10, #0x40000000000000
+; NEON-NEXT:    eor x8, x8, x14
+; NEON-NEXT:    mul x14, x11, x12
+; NEON-NEXT:    ldr x12, [sp, #72] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x12
+; NEON-NEXT:    ldr x12, [sp, #32] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x13
+; NEON-NEXT:    ldr x13, [sp, #24] // 8-byte Reload
+; NEON-NEXT:    eor x8, x8, x12
+; NEON-NEXT:    and x12, x10, #0x80000000000000
+; NEON-NEXT:    eor x8, x8, x13
+; NEON-NEXT:    ldr x13, [sp, #16] // 8-byte Reload
+; NEON-NEXT:    mul x12, x11, x12
+; NEON-NEXT:    eor x8, x8, x13
+; NEON-NEXT:    eor x13, x29, x28
+; NEON-NEXT:    ldr x29, [sp, #8] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x27
+; NEON-NEXT:    and x28, x10, #0x800000000000000
+; NEON-NEXT:    eor x8, x8, x29
+; NEON-NEXT:    eor x13, x13, x25
+; NEON-NEXT:    mul x27, x11, x28
+; NEON-NEXT:    eor x8, x8, x30
+; NEON-NEXT:    and x25, x10, #0x1000000000000000
+; NEON-NEXT:    and x10, x10, #0x8000000000000000
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    eor x9, x13, x24
+; NEON-NEXT:    mul x13, x11, x25
+; NEON-NEXT:    eor x9, x9, x23
+; NEON-NEXT:    eor x9, x9, x21
+; NEON-NEXT:    mul x10, x11, x10
+; NEON-NEXT:    eor x11, x19, x16
+; NEON-NEXT:    eor x9, x9, x20
+; NEON-NEXT:    eor x16, x6, x27
+; NEON-NEXT:    eor x9, x9, x7
+; NEON-NEXT:    eor x9, x9, x5
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    eor x9, x18, x17
+; NEON-NEXT:    eor x8, x8, x9
+; NEON-NEXT:    eor x9, x11, x15
+; NEON-NEXT:    eor x11, x16, x13
+; NEON-NEXT:    rbit x8, x8
+; NEON-NEXT:    eor x9, x9, x14
+; NEON-NEXT:    eor x11, x11, x3
+; NEON-NEXT:    eor x13, x26, x22
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    eor x11, x11, x1
+; NEON-NEXT:    lsr x1, x8, #1
+; NEON-NEXT:    eor x8, x13, x9
+; NEON-NEXT:    eor x9, x11, x10
+; NEON-NEXT:    eor x0, x8, x9
+; NEON-NEXT:    add sp, sp, #624
+; NEON-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v1i128_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-AES-NEXT:    sub sp, sp, #624
+; NEON-AES-NEXT:    .cfi_def_cfa_offset 720
+; NEON-AES-NEXT:    .cfi_offset w19, -8
+; NEON-AES-NEXT:    .cfi_offset w20, -16
+; NEON-AES-NEXT:    .cfi_offset w21, -24
+; NEON-AES-NEXT:    .cfi_offset w22, -32
+; NEON-AES-NEXT:    .cfi_offset w23, -40
+; NEON-AES-NEXT:    .cfi_offset w24, -48
+; NEON-AES-NEXT:    .cfi_offset w25, -56
+; NEON-AES-NEXT:    .cfi_offset w26, -64
+; NEON-AES-NEXT:    .cfi_offset w27, -72
+; NEON-AES-NEXT:    .cfi_offset w28, -80
+; NEON-AES-NEXT:    .cfi_offset w30, -88
+; NEON-AES-NEXT:    .cfi_offset w29, -96
+; NEON-AES-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NEON-AES-NEXT:    fmov x10, d1
+; NEON-AES-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-AES-NEXT:    fmov x11, d0
+; NEON-AES-NEXT:    and x8, x10, #0x2
+; NEON-AES-NEXT:    mul x13, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x1
+; NEON-AES-NEXT:    mul x14, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x4
+; NEON-AES-NEXT:    mul x15, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x8
+; NEON-AES-NEXT:    mul x16, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x10
+; NEON-AES-NEXT:    mul x17, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x20
+; NEON-AES-NEXT:    mul x18, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x40
+; NEON-AES-NEXT:    mul x0, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x80
+; NEON-AES-NEXT:    mul x1, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x100
+; NEON-AES-NEXT:    mul x3, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x200
+; NEON-AES-NEXT:    mul x2, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x400
+; NEON-AES-NEXT:    mul x4, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x800
+; NEON-AES-NEXT:    mul x5, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x1000
+; NEON-AES-NEXT:    mul x20, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x2000
+; NEON-AES-NEXT:    mul x6, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x4000
+; NEON-AES-NEXT:    mul x7, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x8000
+; NEON-AES-NEXT:    mul x19, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x10000
+; NEON-AES-NEXT:    mul x21, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x20000
+; NEON-AES-NEXT:    mul x22, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x40000
+; NEON-AES-NEXT:    mul x23, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x80000
+; NEON-AES-NEXT:    mul x24, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x100000
+; NEON-AES-NEXT:    mul x25, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x200000
+; NEON-AES-NEXT:    mul x26, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x400000
+; NEON-AES-NEXT:    mul x27, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x800000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #592] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x1000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #584] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x2000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #616] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x4000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #576] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x8000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #608] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x10000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #600] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x20000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #568] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x40000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #512] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x80000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #536] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x100000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #528] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x200000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #560] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x400000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #520] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x800000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #552] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x1000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #544] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x2000000000
+; NEON-AES-NEXT:    mul x9, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x4000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #440] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x8000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #464] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x10000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #456] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x20000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #488] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x10, #0x40000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #448] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x80000000000
+; NEON-AES-NEXT:    mul x9, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x100000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #472] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x10, #0x200000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #504] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x400000000000
+; NEON-AES-NEXT:    mul x9, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x800000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #392] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x1000000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #416] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x2000000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #408] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x4000000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #424] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x10, #0x8000000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    str x8, [sp, #400] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x10, #0x100000000000000
+; NEON-AES-NEXT:    mul x9, x11, x8
+; NEON-AES-NEXT:    and x8, x10, #0x200000000000000
+; NEON-AES-NEXT:    mul x8, x11, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #376] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x9, x10, #0x400000000000000
+; NEON-AES-NEXT:    rbit x8, x10
+; NEON-AES-NEXT:    mul x9, x11, x9
+; NEON-AES-NEXT:    and x12, x8, #0x2
+; NEON-AES-NEXT:    str x9, [sp, #368] // 8-byte Spill
+; NEON-AES-NEXT:    rbit x9, x11
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #360] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x1
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #352] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x4
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #344] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x8
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #336] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x10
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #328] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x20
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #320] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x40
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #312] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x80
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #304] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x100
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #296] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x200
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #288] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x400
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #280] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x800
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #272] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x1000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #256] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x2000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #248] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x4000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #264] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x8000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #240] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x10000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #232] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x20000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #200] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x40000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #224] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x80000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #192] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x100000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #216] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x200000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #208] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x400000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #184] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x800000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #136] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x1000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #168] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x2000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #160] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x4000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #176] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x8000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #152] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x10000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #144] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x20000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #128] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x40000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #120] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x80000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #112] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x100000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #104] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x200000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #96] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x400000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #88] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x800000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #80] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x1000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #72] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x2000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #64] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x4000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #56] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x8000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #48] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x10000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #40] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x20000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #32] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x40000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #24] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x80000000000
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    str x12, [sp, #16] // 8-byte Spill
+; NEON-AES-NEXT:    eor x12, x14, x13
+; NEON-AES-NEXT:    and x14, x8, #0x100000000000
+; NEON-AES-NEXT:    mul x14, x9, x14
+; NEON-AES-NEXT:    eor x13, x15, x16
+; NEON-AES-NEXT:    and x15, x8, #0x200000000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    eor x13, x17, x18
+; NEON-AES-NEXT:    ldr x16, [sp, #608] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x0
+; NEON-AES-NEXT:    mul x30, x9, x15
+; NEON-AES-NEXT:    and x15, x8, #0x400000000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    mul x29, x9, x15
+; NEON-AES-NEXT:    and x15, x8, #0x800000000000
+; NEON-AES-NEXT:    str x14, [sp, #8] // 8-byte Spill
+; NEON-AES-NEXT:    eor x14, x1, x3
+; NEON-AES-NEXT:    eor x13, x14, x2
+; NEON-AES-NEXT:    eor x14, x5, x20
+; NEON-AES-NEXT:    mul x28, x9, x15
+; NEON-AES-NEXT:    eor x13, x13, x4
+; NEON-AES-NEXT:    ldr x15, [sp, #592] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    eor x13, x14, x6
+; NEON-AES-NEXT:    eor x14, x21, x22
+; NEON-AES-NEXT:    eor x13, x13, x7
+; NEON-AES-NEXT:    eor x14, x14, x23
+; NEON-AES-NEXT:    eor x15, x27, x15
+; NEON-AES-NEXT:    eor x13, x13, x19
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    eor x13, x14, x24
+; NEON-AES-NEXT:    and x14, x8, #0x1000000000000
+; NEON-AES-NEXT:    eor x13, x13, x25
+; NEON-AES-NEXT:    mul x27, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #584] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x26
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #616] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    and x15, x8, #0x2000000000000
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #576] // 8-byte Reload
+; NEON-AES-NEXT:    mul x25, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #512] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #568] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #536] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x4000000000000
+; NEON-AES-NEXT:    mul x24, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #600] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #552] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #528] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #560] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x8000000000000
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #520] // 8-byte Reload
+; NEON-AES-NEXT:    mul x23, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #440] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #496] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #464] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x10000000000000
+; NEON-AES-NEXT:    mul x21, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #544] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #256] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #456] // 8-byte Reload
+; NEON-AES-NEXT:    eor x26, x12, x13
+; NEON-AES-NEXT:    ldr x12, [sp, #488] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x13, [sp, #448] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x20000000000000
+; NEON-AES-NEXT:    eor x12, x14, x12
+; NEON-AES-NEXT:    mul x20, x9, x15
+; NEON-AES-NEXT:    ldr x14, [sp, #392] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #432] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x15, [sp, #480] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    and x14, x8, #0x40000000000000
+; NEON-AES-NEXT:    eor x12, x12, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #416] // 8-byte Reload
+; NEON-AES-NEXT:    mul x7, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #472] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #504] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #408] // 8-byte Reload
+; NEON-AES-NEXT:    eor x22, x12, x15
+; NEON-AES-NEXT:    ldr x12, [sp, #424] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    and x14, x8, #0x80000000000000
+; NEON-AES-NEXT:    eor x12, x13, x12
+; NEON-AES-NEXT:    ldr x13, [sp, #400] // 8-byte Reload
+; NEON-AES-NEXT:    mul x5, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #368] // 8-byte Reload
+; NEON-AES-NEXT:    eor x19, x12, x13
+; NEON-AES-NEXT:    ldp x13, x12, [sp, #376] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    and x13, x8, #0x100000000000000
+; NEON-AES-NEXT:    eor x6, x12, x14
+; NEON-AES-NEXT:    ldp x14, x12, [sp, #352] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x4, x9, x13
+; NEON-AES-NEXT:    eor x12, x14, x12
+; NEON-AES-NEXT:    ldp x14, x13, [sp, #336] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldp x15, x14, [sp, #320] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #312] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x200000000000000
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    mul x3, x9, x15
+; NEON-AES-NEXT:    and x15, x8, #0x400000000000000
+; NEON-AES-NEXT:    eor x14, x12, x13
+; NEON-AES-NEXT:    ldp x13, x12, [sp, #296] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x2, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #280] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x12, x13
+; NEON-AES-NEXT:    ldr x12, [sp, #288] // 8-byte Reload
+; NEON-AES-NEXT:    eor x3, x4, x3
+; NEON-AES-NEXT:    and x4, x10, #0x2000000000000000
+; NEON-AES-NEXT:    eor x13, x13, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #272] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #248] // 8-byte Reload
+; NEON-AES-NEXT:    eor x2, x3, x2
+; NEON-AES-NEXT:    eor x12, x12, x16
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #264] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x15
+; NEON-AES-NEXT:    and x15, x8, #0x800000000000000
+; NEON-AES-NEXT:    mul x3, x11, x4
+; NEON-AES-NEXT:    eor x14, x12, x14
+; NEON-AES-NEXT:    mul x1, x9, x15
+; NEON-AES-NEXT:    ldp x12, x15, [sp, #232] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #200] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldp x16, x14, [sp, #216] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x15, x12, x15
+; NEON-AES-NEXT:    and x12, x8, #0x1000000000000000
+; NEON-AES-NEXT:    eor x1, x2, x1
+; NEON-AES-NEXT:    mul x0, x9, x12
+; NEON-AES-NEXT:    and x2, x10, #0x4000000000000000
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    ldp x12, x15, [sp, #184] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #136] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #168] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x1, x0
+; NEON-AES-NEXT:    eor x15, x12, x15
+; NEON-AES-NEXT:    and x12, x8, #0x2000000000000000
+; NEON-AES-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-AES-NEXT:    mul x18, x9, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #208] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    eor x14, x14, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #160] // 8-byte Reload
+; NEON-AES-NEXT:    mul x17, x9, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    eor x15, x15, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #176] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x11, x2
+; NEON-AES-NEXT:    eor x18, x0, x18
+; NEON-AES-NEXT:    eor x14, x15, x12
+; NEON-AES-NEXT:    and x15, x10, #0x10000000000000
+; NEON-AES-NEXT:    eor x9, x14, x8
+; NEON-AES-NEXT:    ldp x12, x8, [sp, #120] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x16, x11, x15
+; NEON-AES-NEXT:    eor x14, x8, x12
+; NEON-AES-NEXT:    ldr x8, [sp, #144] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #112] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x13, x9
+; NEON-AES-NEXT:    eor x14, x14, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #104] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x8
+; NEON-AES-NEXT:    ldr x8, [sp, #96] // 8-byte Reload
+; NEON-AES-NEXT:    and x14, x10, #0x20000000000000
+; NEON-AES-NEXT:    mul x15, x11, x14
+; NEON-AES-NEXT:    eor x13, x13, x8
+; NEON-AES-NEXT:    ldp x12, x8, [sp, #56] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x8, x8, x12
+; NEON-AES-NEXT:    ldp x14, x12, [sp, #80] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x13, x13, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #48] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #40] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x12
+; NEON-AES-NEXT:    and x12, x10, #0x40000000000000
+; NEON-AES-NEXT:    eor x8, x8, x14
+; NEON-AES-NEXT:    mul x14, x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #72] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #32] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #24] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x8, x12
+; NEON-AES-NEXT:    and x12, x10, #0x80000000000000
+; NEON-AES-NEXT:    eor x8, x8, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #16] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x11, x12
+; NEON-AES-NEXT:    eor x8, x8, x13
+; NEON-AES-NEXT:    eor x13, x29, x28
+; NEON-AES-NEXT:    ldr x29, [sp, #8] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x27
+; NEON-AES-NEXT:    and x28, x10, #0x800000000000000
+; NEON-AES-NEXT:    eor x8, x8, x29
+; NEON-AES-NEXT:    eor x13, x13, x25
+; NEON-AES-NEXT:    mul x27, x11, x28
+; NEON-AES-NEXT:    eor x8, x8, x30
+; NEON-AES-NEXT:    and x25, x10, #0x1000000000000000
+; NEON-AES-NEXT:    and x10, x10, #0x8000000000000000
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    eor x9, x13, x24
+; NEON-AES-NEXT:    mul x13, x11, x25
+; NEON-AES-NEXT:    eor x9, x9, x23
+; NEON-AES-NEXT:    eor x9, x9, x21
+; NEON-AES-NEXT:    mul x10, x11, x10
+; NEON-AES-NEXT:    eor x11, x19, x16
+; NEON-AES-NEXT:    eor x9, x9, x20
+; NEON-AES-NEXT:    eor x16, x6, x27
+; NEON-AES-NEXT:    eor x9, x9, x7
+; NEON-AES-NEXT:    eor x9, x9, x5
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    eor x9, x18, x17
+; NEON-AES-NEXT:    eor x8, x8, x9
+; NEON-AES-NEXT:    eor x9, x11, x15
+; NEON-AES-NEXT:    eor x11, x16, x13
+; NEON-AES-NEXT:    rbit x8, x8
+; NEON-AES-NEXT:    eor x9, x9, x14
+; NEON-AES-NEXT:    eor x11, x11, x3
+; NEON-AES-NEXT:    eor x13, x26, x22
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    eor x11, x11, x1
+; NEON-AES-NEXT:    lsr x1, x8, #1
+; NEON-AES-NEXT:    eor x8, x13, x9
+; NEON-AES-NEXT:    eor x9, x11, x10
+; NEON-AES-NEXT:    eor x0, x8, x9
+; NEON-AES-NEXT:    add sp, sp, #624
+; NEON-AES-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-AES-NEXT:    ret
+  %zextx = zext <1 x i64> %x to <1 x i128>
+  %zexty = zext <1 x i64> %y to <1 x i128>
+  %a = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %zextx, <1 x i128> %zexty)
+  ret <1 x i128> %a
+}
+
+define <2 x i128> @clmul_v2i128_neon_zext(<2 x i64> %x, <2 x i64> %y) {
+; NEON-LABEL: clmul_v2i128_neon_zext:
+; NEON:       // %bb.0:
+; NEON-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-NEXT:    sub sp, sp, #1376
+; NEON-NEXT:    .cfi_def_cfa_offset 1472
+; NEON-NEXT:    .cfi_offset w19, -8
+; NEON-NEXT:    .cfi_offset w20, -16
+; NEON-NEXT:    .cfi_offset w21, -24
+; NEON-NEXT:    .cfi_offset w22, -32
+; NEON-NEXT:    .cfi_offset w23, -40
+; NEON-NEXT:    .cfi_offset w24, -48
+; NEON-NEXT:    .cfi_offset w25, -56
+; NEON-NEXT:    .cfi_offset w26, -64
+; NEON-NEXT:    .cfi_offset w27, -72
+; NEON-NEXT:    .cfi_offset w28, -80
+; NEON-NEXT:    .cfi_offset w30, -88
+; NEON-NEXT:    .cfi_offset w29, -96
+; NEON-NEXT:    fmov x9, d1
+; NEON-NEXT:    fmov x8, d0
+; NEON-NEXT:    and x10, x9, #0x2
+; NEON-NEXT:    mul x0, x8, x10
+; NEON-NEXT:    and x10, x9, #0x1
+; NEON-NEXT:    mul x5, x8, x10
+; NEON-NEXT:    and x10, x9, #0x4
+; NEON-NEXT:    mul x7, x8, x10
+; NEON-NEXT:    and x10, x9, #0x8
+; NEON-NEXT:    mul x24, x8, x10
+; NEON-NEXT:    and x10, x9, #0x10
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    eor x5, x7, x24
+; NEON-NEXT:    str x10, [sp, #1368] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1360] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1352] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    ldr x7, [sp, #1352] // 8-byte Reload
+; NEON-NEXT:    str x10, [sp, #1344] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1328] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    ldr x24, [sp, #1328] // 8-byte Reload
+; NEON-NEXT:    str x10, [sp, #1320] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1336] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1312] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1304] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1296] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1288] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1280] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1272] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1248] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1240] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1264] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1232] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1256] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1216] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1176] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1208] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1200] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1192] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1184] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1224] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1168] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1120] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1112] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1160] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1152] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1136] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1128] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1144] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1104] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1048] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1040] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1080] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1072] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1064] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1056] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1096] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1088] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1008] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #968] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #960] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #992] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #984] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1000] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #976] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1032] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1024] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #1016] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #944] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #904] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #936] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #928] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #920] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #912] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #952] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000000000000
+; NEON-NEXT:    rbit x9, x9
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    rbit x8, x8
+; NEON-NEXT:    str x10, [sp, #448] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #896] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #888] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #880] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #872] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #864] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #856] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #848] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #840] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #832] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #824] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #816] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #808] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #800] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #792] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #784] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #776] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #768] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #744] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #736] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #760] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #728] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #752] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #720] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #672] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #704] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #696] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #688] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #680] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #712] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #664] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #616] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #608] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #632] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #624] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #656] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #648] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #640] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #600] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x4000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #576] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x8000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #568] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x10000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #560] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #552] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x40000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #544] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #536] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #592] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x200000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #584] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #520] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000000
+; NEON-NEXT:    mul x11, x8, x10
+; NEON-NEXT:    and x10, x9, #0x1000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    stp x10, x11, [sp, #456] // 16-byte Folded Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000000
+; NEON-NEXT:    mul x11, x8, x10
+; NEON-NEXT:    and x10, x9, #0x4000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    stp x10, x11, [sp, #480] // 16-byte Folded Spill
+; NEON-NEXT:    and x10, x9, #0x8000000000000
+; NEON-NEXT:    mul x11, x8, x10
+; NEON-NEXT:    and x10, x9, #0x10000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #472] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x20000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    stp x10, x11, [sp, #504] // 16-byte Folded Spill
+; NEON-NEXT:    and x10, x9, #0x40000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #496] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x80000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #528] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x100000000000000
+; NEON-NEXT:    mul x11, x8, x10
+; NEON-NEXT:    and x10, x9, #0x200000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #400] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x400000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #424] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x800000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    str x10, [sp, #416] // 8-byte Spill
+; NEON-NEXT:    and x10, x9, #0x1000000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    stp x10, x11, [sp, #432] // 16-byte Folded Spill
+; NEON-NEXT:    and x10, x9, #0x2000000000000000
+; NEON-NEXT:    and x9, x9, #0x4000000000000000
+; NEON-NEXT:    mul x10, x8, x10
+; NEON-NEXT:    mov x11, v1.d[1]
+; NEON-NEXT:    mul x8, x8, x9
+; NEON-NEXT:    str x10, [sp, #408] // 8-byte Spill
+; NEON-NEXT:    mov x10, v0.d[1]
+; NEON-NEXT:    str x8, [sp, #392] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x2
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #296] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x1
+; NEON-NEXT:    mul x9, x10, x8
+; NEON-NEXT:    and x8, x11, #0x4
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #224] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x8
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #168] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x10
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #272] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x20
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #216] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x40
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    stp x8, x9, [sp, #248] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x11, #0x80
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #136] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x100
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #88] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x200
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    ldr x28, [sp, #88] // 8-byte Reload
+; NEON-NEXT:    str x8, [sp, #104] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x400
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #160] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x800
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #264] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x1000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #208] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x2000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #240] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x4000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #288] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x8000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #304] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x10000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #48] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x20000
+; NEON-NEXT:    mul x26, x10, x8
+; NEON-NEXT:    and x8, x11, #0x40000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #16] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x80000
+; NEON-NEXT:    mul x9, x10, x8
+; NEON-NEXT:    and x8, x11, #0x100000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    stp x9, x8, [sp, #72] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x11, #0x200000
+; NEON-NEXT:    mul x9, x10, x8
+; NEON-NEXT:    and x8, x11, #0x400000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #184] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x800000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #128] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x1000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #120] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x2000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    stp x8, x9, [sp, #144] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x11, #0x4000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #200] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x8000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #232] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x10000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #280] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x20000000
+; NEON-NEXT:    mul x20, x10, x8
+; NEON-NEXT:    and x8, x11, #0x40000000
+; NEON-NEXT:    mul x15, x10, x8
+; NEON-NEXT:    and x8, x11, #0x80000000
+; NEON-NEXT:    mul x19, x10, x8
+; NEON-NEXT:    and x8, x11, #0x100000000
+; NEON-NEXT:    mul x22, x10, x8
+; NEON-NEXT:    and x8, x11, #0x200000000
+; NEON-NEXT:    eor x15, x20, x15
+; NEON-NEXT:    mul x25, x10, x8
+; NEON-NEXT:    and x8, x11, #0x400000000
+; NEON-NEXT:    eor x15, x15, x19
+; NEON-NEXT:    mul x29, x10, x8
+; NEON-NEXT:    and x8, x11, #0x800000000
+; NEON-NEXT:    eor x15, x15, x22
+; NEON-NEXT:    mul x9, x10, x8
+; NEON-NEXT:    and x8, x11, #0x1000000000
+; NEON-NEXT:    eor x15, x15, x25
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    eor x15, x15, x29
+; NEON-NEXT:    str x8, [sp, #64] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x2000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    stp x8, x9, [sp, #32] // 16-byte Folded Spill
+; NEON-NEXT:    and x8, x11, #0x4000000000
+; NEON-NEXT:    and x9, x11, #0x400000000000000
+; NEON-NEXT:    mul x27, x10, x8
+; NEON-NEXT:    and x8, x11, #0x8000000000
+; NEON-NEXT:    mul x30, x10, x8
+; NEON-NEXT:    and x8, x11, #0x10000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    mul x13, x10, x9
+; NEON-NEXT:    rbit x9, x10
+; NEON-NEXT:    str x8, [sp, #24] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x20000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #56] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x40000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #96] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x80000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #112] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x100000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #176] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x200000000000
+; NEON-NEXT:    mul x8, x10, x8
+; NEON-NEXT:    str x8, [sp, #192] // 8-byte Spill
+; NEON-NEXT:    and x8, x11, #0x400000000000
+; NEON-NEXT:    mul x17, x10, x8
+; NEON-NEXT:    and x8, x11, #0x800000000000
+; NEON-NEXT:    mul x12, x10, x8
+; NEON-NEXT:    and x8, x11, #0x1000000000000
+; NEON-NEXT:    mul x14, x10, x8
+; NEON-NEXT:    and x8, x11, #0x2000000000000
+; NEON-NEXT:    mul x18, x10, x8
+; NEON-NEXT:    and x8, x11, #0x4000000000000
+; NEON-NEXT:    eor x12, x17, x12
+; NEON-NEXT:    mul x1, x10, x8
+; NEON-NEXT:    and x8, x11, #0x8000000000000
+; NEON-NEXT:    eor x12, x12, x14
+; NEON-NEXT:    mul x3, x10, x8
+; NEON-NEXT:    and x8, x11, #0x10000000000000
+; NEON-NEXT:    eor x12, x12, x18
+; NEON-NEXT:    mul x6, x10, x8
+; NEON-NEXT:    and x8, x11, #0x20000000000000
+; NEON-NEXT:    eor x12, x12, x1
+; NEON-NEXT:    mul x21, x10, x8
+; NEON-NEXT:    and x8, x11, #0x40000000000000
+; NEON-NEXT:    eor x12, x12, x3
+; NEON-NEXT:    mul x23, x10, x8
+; NEON-NEXT:    and x8, x11, #0x100000000000000
+; NEON-NEXT:    eor x12, x12, x6
+; NEON-NEXT:    mul x4, x10, x8
+; NEON-NEXT:    and x8, x11, #0x200000000000000
+; NEON-NEXT:    eor x12, x12, x21
+; NEON-NEXT:    mul x2, x10, x8
+; NEON-NEXT:    rbit x8, x11
+; NEON-NEXT:    eor x12, x12, x23
+; NEON-NEXT:    and x16, x8, #0x2
+; NEON-NEXT:    and x14, x8, #0x800000000
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    mul x14, x9, x14
+; NEON-NEXT:    str x16, [sp, #384] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x1
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #376] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x4
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #368] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x8
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #360] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x10
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #352] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x20
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #336] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x40
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #344] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x80
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #328] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x100
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #320] // 8-byte Spill
+; NEON-NEXT:    and x16, x8, #0x200
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    str x16, [sp, #312] // 8-byte Spill
+; NEON-NEXT:    eor x16, x0, x5
+; NEON-NEXT:    ldr x0, [sp, #1368] // 8-byte Reload
+; NEON-NEXT:    ldr x5, [sp, #1360] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    and x5, x8, #0x400
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #1344] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1320] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1304] // 8-byte Reload
+; NEON-NEXT:    eor x0, x7, x0
+; NEON-NEXT:    ldr x7, [sp, #1336] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1360] // 8-byte Spill
+; NEON-NEXT:    and x5, x8, #0x800
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    ldr x7, [sp, #1312] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1296] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1248] // 8-byte Reload
+; NEON-NEXT:    eor x0, x7, x0
+; NEON-NEXT:    ldr x7, [sp, #1288] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1352] // 8-byte Spill
+; NEON-NEXT:    and x5, x8, #0x1000
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    ldr x7, [sp, #1272] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1256] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1336] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #1280] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #1240] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1264] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    and x7, x8, #0x2000
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    mul x5, x9, x7
+; NEON-NEXT:    ldr x7, [sp, #1176] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1328] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #1232] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #1216] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #1208] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    and x7, x8, #0x4000
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1200] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    ldr x24, [sp, #1120] // 8-byte Reload
+; NEON-NEXT:    str x14, [sp, #1120] // 8-byte Spill
+; NEON-NEXT:    ldr x14, [sp, #112] // 8-byte Reload
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    and x5, x8, #0x8000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    str x7, [sp, #1320] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #1192] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #1168] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1312] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #1184] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1224] // 8-byte Reload
+; NEON-NEXT:    str x12, [sp, #1184] // 8-byte Spill
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #1112] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #1160] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    and x7, x8, #0x10000
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    ldr x0, [sp, #1152] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    ldr x24, [sp, #1048] // 8-byte Reload
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    and x5, x8, #0x20000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    str x7, [sp, #1304] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #1136] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #1104] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1296] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #1128] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1144] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #1040] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #1080] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1072] // 8-byte Reload
+; NEON-NEXT:    and x7, x8, #0x40000
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    ldr x24, [sp, #968] // 8-byte Reload
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    and x5, x8, #0x80000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    str x7, [sp, #1288] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #1064] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1280] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #1056] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #1008] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #960] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #1096] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    and x7, x8, #0x100000
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #992] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    ldr x24, [sp, #1032] // 8-byte Reload
+; NEON-NEXT:    str x7, [sp, #1272] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #1088] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #984] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #1000] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    and x7, x8, #0x200000
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    mul x5, x9, x7
+; NEON-NEXT:    ldr x7, [sp, #904] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1264] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #976] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #944] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #936] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    and x7, x8, #0x400000
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    ldr x24, [sp, #1016] // 8-byte Reload
+; NEON-NEXT:    str x7, [sp, #1256] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #1024] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #928] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #920] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    and x7, x8, #0x800000
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    and x0, x8, #0x1000000
+; NEON-NEXT:    str x16, [sp, #1368] // 8-byte Spill
+; NEON-NEXT:    ldr x16, [sp, #912] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    ldr x24, [sp, #168] // 8-byte Reload
+; NEON-NEXT:    mul x0, x9, x0
+; NEON-NEXT:    eor x16, x5, x16
+; NEON-NEXT:    ldr x5, [sp, #296] // 8-byte Reload
+; NEON-NEXT:    str x7, [sp, #1240] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #256] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    ldr x7, [sp, #224] // 8-byte Reload
+; NEON-NEXT:    str x0, [sp, #1232] // 8-byte Spill
+; NEON-NEXT:    ldr x0, [sp, #952] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #216] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    eor x0, x5, x7
+; NEON-NEXT:    and x5, x8, #0x2000000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    ldr x7, [sp, #272] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x24
+; NEON-NEXT:    ldr x24, [sp, #136] // 8-byte Reload
+; NEON-NEXT:    eor x24, x24, x28
+; NEON-NEXT:    str x5, [sp, #1224] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #248] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    ldr x7, [sp, #104] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    ldr x5, [sp, #160] // 8-byte Reload
+; NEON-NEXT:    eor x7, x24, x7
+; NEON-NEXT:    and x24, x8, #0x4000000
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    mul x7, x9, x24
+; NEON-NEXT:    ldr x24, [sp, #288] // 8-byte Reload
+; NEON-NEXT:    str x7, [sp, #1216] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #448] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x7
+; NEON-NEXT:    ldr x7, [sp, #208] // 8-byte Reload
+; NEON-NEXT:    str x16, [sp, #1344] // 8-byte Spill
+; NEON-NEXT:    eor x16, x0, x5
+; NEON-NEXT:    and x0, x8, #0x8000000
+; NEON-NEXT:    mul x0, x9, x0
+; NEON-NEXT:    ldr x5, [sp, #264] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    ldr x7, [sp, #48] // 8-byte Reload
+; NEON-NEXT:    eor x7, x7, x26
+; NEON-NEXT:    str x0, [sp, #1208] // 8-byte Spill
+; NEON-NEXT:    ldr x0, [sp, #240] // 8-byte Reload
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    ldr x5, [sp, #16] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x24
+; NEON-NEXT:    ldr x24, [sp, #72] // 8-byte Reload
+; NEON-NEXT:    eor x5, x7, x5
+; NEON-NEXT:    and x7, x8, #0x10000000
+; NEON-NEXT:    mul x7, x9, x7
+; NEON-NEXT:    eor x5, x5, x24
+; NEON-NEXT:    str x7, [sp, #1192] // 8-byte Spill
+; NEON-NEXT:    ldr x7, [sp, #304] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #80] // 8-byte Reload
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #152] // 8-byte Reload
+; NEON-NEXT:    eor x5, x5, x7
+; NEON-NEXT:    and x7, x8, #0x20000000
+; NEON-NEXT:    eor x0, x5, x0
+; NEON-NEXT:    mul x5, x9, x7
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #184] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1176] // 8-byte Spill
+; NEON-NEXT:    ldp x7, x5, [sp, #120] // 16-byte Folded Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    and x5, x8, #0x40000000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #200] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1160] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #144] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    and x5, x8, #0x80000000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    ldr x7, [sp, #280] // 8-byte Reload
+; NEON-NEXT:    str x5, [sp, #1152] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #232] // 8-byte Reload
+; NEON-NEXT:    eor x0, x0, x5
+; NEON-NEXT:    and x5, x8, #0x100000000
+; NEON-NEXT:    mul x5, x9, x5
+; NEON-NEXT:    eor x0, x0, x7
+; NEON-NEXT:    eor x16, x16, x0
+; NEON-NEXT:    ldr x0, [sp, #40] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x0
+; NEON-NEXT:    and x0, x8, #0x200000000
+; NEON-NEXT:    mul x0, x9, x0
+; NEON-NEXT:    str x5, [sp, #1144] // 8-byte Spill
+; NEON-NEXT:    ldr x5, [sp, #64] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x5
+; NEON-NEXT:    ldr x5, [sp, #32] // 8-byte Reload
+; NEON-NEXT:    eor x15, x16, x15
+; NEON-NEXT:    and x16, x8, #0x400000000
+; NEON-NEXT:    mul x16, x9, x16
+; NEON-NEXT:    eor x5, x5, x27
+; NEON-NEXT:    str x0, [sp, #1136] // 8-byte Spill
+; NEON-NEXT:    ldr x0, [sp, #24] // 8-byte Reload
+; NEON-NEXT:    str x15, [sp, #1248] // 8-byte Spill
+; NEON-NEXT:    eor x15, x5, x30
+; NEON-NEXT:    eor x15, x15, x0
+; NEON-NEXT:    str x16, [sp, #1128] // 8-byte Spill
+; NEON-NEXT:    ldr x16, [sp, #56] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #96] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #176] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    and x15, x8, #0x1000000000
+; NEON-NEXT:    mul x15, x9, x15
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #832] // 8-byte Reload
+; NEON-NEXT:    str x15, [sp, #1112] // 8-byte Spill
+; NEON-NEXT:    ldr x15, [sp, #192] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    eor x15, x4, x2
+; NEON-NEXT:    eor x12, x15, x13
+; NEON-NEXT:    str x14, [sp, #1200] // 8-byte Spill
+; NEON-NEXT:    and x14, x8, #0x2000000000
+; NEON-NEXT:    str x12, [sp, #1168] // 8-byte Spill
+; NEON-NEXT:    and x12, x8, #0x4000000000
+; NEON-NEXT:    mul x28, x9, x14
+; NEON-NEXT:    ldr x13, [sp, #896] // 8-byte Reload
+; NEON-NEXT:    ldr x14, [sp, #888] // 8-byte Reload
+; NEON-NEXT:    ldr x15, [sp, #872] // 8-byte Reload
+; NEON-NEXT:    mul x12, x9, x12
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #880] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    ldr x15, [sp, #848] // 8-byte Reload
+; NEON-NEXT:    str x12, [sp, #1104] // 8-byte Spill
+; NEON-NEXT:    eor x12, x13, x14
+; NEON-NEXT:    ldr x13, [sp, #864] // 8-byte Reload
+; NEON-NEXT:    ldr x14, [sp, #856] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    and x14, x8, #0x8000000000
+; NEON-NEXT:    mul x14, x9, x14
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #840] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #824] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #800] // 8-byte Reload
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    ldr x15, [sp, #816] // 8-byte Reload
+; NEON-NEXT:    str x14, [sp, #1096] // 8-byte Spill
+; NEON-NEXT:    and x14, x8, #0x10000000000
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    mul x14, x9, x14
+; NEON-NEXT:    ldr x15, [sp, #808] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #792] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #744] // 8-byte Reload
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    ldr x15, [sp, #784] // 8-byte Reload
+; NEON-NEXT:    str x14, [sp, #1088] // 8-byte Spill
+; NEON-NEXT:    and x14, x8, #0x20000000000
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    mul x26, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #776] // 8-byte Reload
+; NEON-NEXT:    ldr x15, [sp, #768] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #736] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #760] // 8-byte Reload
+; NEON-NEXT:    ldr x16, [sp, #752] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    and x15, x8, #0x40000000000
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #728] // 8-byte Reload
+; NEON-NEXT:    mul x25, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #672] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #720] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #704] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x80000000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #696] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    mul x27, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #688] // 8-byte Reload
+; NEON-NEXT:    ldr x16, [sp, #616] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    and x14, x8, #0x100000000000
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #664] // 8-byte Reload
+; NEON-NEXT:    mul x29, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #680] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #712] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #608] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #632] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    and x15, x8, #0x200000000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #624] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    mul x30, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #656] // 8-byte Reload
+; NEON-NEXT:    ldr x16, [sp, #576] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    and x14, x8, #0x400000000000
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #600] // 8-byte Reload
+; NEON-NEXT:    mul x24, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #648] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #640] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #568] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #560] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #552] // 8-byte Reload
+; NEON-NEXT:    and x15, x8, #0x800000000000
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #464] // 8-byte Reload
+; NEON-NEXT:    mul x23, x9, x15
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    and x14, x8, #0x1000000000000
+; NEON-NEXT:    ldr x15, [sp, #544] // 8-byte Reload
+; NEON-NEXT:    mul x22, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #536] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #520] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #456] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #592] // 8-byte Reload
+; NEON-NEXT:    eor x14, x15, x14
+; NEON-NEXT:    and x15, x8, #0x2000000000000
+; NEON-NEXT:    mul x20, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #584] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #488] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #480] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldp x16, x13, [sp, #504] // 16-byte Folded Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x4000000000000
+; NEON-NEXT:    mul x7, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #400] // 8-byte Reload
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    ldr x14, [sp, #472] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #440] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #424] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    and x15, x8, #0x8000000000000
+; NEON-NEXT:    mul x6, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #496] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    ldr x16, [sp, #528] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    ldr x15, [sp, #416] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #432] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x15
+; NEON-NEXT:    eor x21, x12, x13
+; NEON-NEXT:    ldr x12, [sp, #408] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x16
+; NEON-NEXT:    and x13, x8, #0x20000000000000
+; NEON-NEXT:    and x15, x8, #0x10000000000000
+; NEON-NEXT:    eor x12, x14, x12
+; NEON-NEXT:    ldr x14, [sp, #392] // 8-byte Reload
+; NEON-NEXT:    mul x4, x9, x13
+; NEON-NEXT:    ldr x16, [sp, #1336] // 8-byte Reload
+; NEON-NEXT:    eor x19, x12, x14
+; NEON-NEXT:    ldp x14, x12, [sp, #376] // 16-byte Folded Reload
+; NEON-NEXT:    mul x5, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #336] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x12
+; NEON-NEXT:    ldp x13, x12, [sp, #360] // 16-byte Folded Reload
+; NEON-NEXT:    eor x13, x12, x13
+; NEON-NEXT:    ldr x12, [sp, #352] // 8-byte Reload
+; NEON-NEXT:    eor x14, x14, x13
+; NEON-NEXT:    ldr x13, [sp, #344] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x15
+; NEON-NEXT:    and x15, x8, #0x40000000000000
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    mul x3, x9, x15
+; NEON-NEXT:    and x15, x8, #0x80000000000000
+; NEON-NEXT:    eor x14, x14, x12
+; NEON-NEXT:    ldp x13, x12, [sp, #320] // 16-byte Folded Reload
+; NEON-NEXT:    mul x2, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #1360] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #312] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #1352] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x15
+; NEON-NEXT:    ldr x15, [sp, #1328] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    eor x12, x14, x12
+; NEON-NEXT:    ldr x14, [sp, #1320] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x15
+; NEON-NEXT:    and x15, x8, #0x100000000000000
+; NEON-NEXT:    ldr x16, [sp, #1272] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #1312] // 8-byte Reload
+; NEON-NEXT:    mul x1, x9, x15
+; NEON-NEXT:    ldr x15, [sp, #1296] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #1304] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #1288] // 8-byte Reload
+; NEON-NEXT:    eor x15, x14, x15
+; NEON-NEXT:    and x14, x8, #0x200000000000000
+; NEON-NEXT:    mul x0, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #1280] // 8-byte Reload
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    ldr x15, [sp, #1240] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #1256] // 8-byte Reload
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #1232] // 8-byte Reload
+; NEON-NEXT:    eor x15, x14, x15
+; NEON-NEXT:    and x14, x8, #0x400000000000000
+; NEON-NEXT:    mul x18, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #1264] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    ldr x16, [sp, #1192] // 8-byte Reload
+; NEON-NEXT:    eor x0, x1, x0
+; NEON-NEXT:    and x1, x11, #0x2000000000000000
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #1224] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x13
+; NEON-NEXT:    ldr x13, [sp, #1216] // 8-byte Reload
+; NEON-NEXT:    eor x15, x15, x14
+; NEON-NEXT:    and x14, x8, #0x800000000000000
+; NEON-NEXT:    mul x17, x9, x14
+; NEON-NEXT:    ldr x14, [sp, #1208] // 8-byte Reload
+; NEON-NEXT:    eor x13, x15, x13
+; NEON-NEXT:    ldr x15, [sp, #1160] // 8-byte Reload
+; NEON-NEXT:    eor x18, x0, x18
+; NEON-NEXT:    eor x13, x13, x14
+; NEON-NEXT:    ldr x14, [sp, #1176] // 8-byte Reload
+; NEON-NEXT:    mul x0, x10, x1
+; NEON-NEXT:    eor x13, x13, x16
+; NEON-NEXT:    ldr x16, [sp, #1152] // 8-byte Reload
+; NEON-NEXT:    eor x15, x14, x15
+; NEON-NEXT:    and x14, x8, #0x1000000000000000
+; NEON-NEXT:    eor x15, x15, x16
+; NEON-NEXT:    mul x16, x9, x14
+; NEON-NEXT:    eor x14, x12, x13
+; NEON-NEXT:    ldr x12, [sp, #1144] // 8-byte Reload
+; NEON-NEXT:    eor x17, x18, x17
+; NEON-NEXT:    and x18, x11, #0x4000000000000000
+; NEON-NEXT:    eor x13, x15, x12
+; NEON-NEXT:    ldr x12, [sp, #1136] // 8-byte Reload
+; NEON-NEXT:    and x15, x8, #0x2000000000000000
+; NEON-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-NEXT:    mul x15, x9, x15
+; NEON-NEXT:    eor x13, x13, x12
+; NEON-NEXT:    ldr x12, [sp, #1104] // 8-byte Reload
+; NEON-NEXT:    eor x16, x17, x16
+; NEON-NEXT:    mul x8, x9, x8
+; NEON-NEXT:    ldr x9, [sp, #1112] // 8-byte Reload
+; NEON-NEXT:    eor x12, x28, x12
+; NEON-NEXT:    ldr x28, [sp, #1128] // 8-byte Reload
+; NEON-NEXT:    mul x17, x10, x18
+; NEON-NEXT:    eor x13, x13, x28
+; NEON-NEXT:    ldr x28, [sp, #1096] // 8-byte Reload
+; NEON-NEXT:    eor x15, x16, x15
+; NEON-NEXT:    eor x12, x12, x28
+; NEON-NEXT:    ldr x28, [sp, #1120] // 8-byte Reload
+; NEON-NEXT:    eor x8, x15, x8
+; NEON-NEXT:    eor x13, x13, x28
+; NEON-NEXT:    ldr x28, [sp, #1088] // 8-byte Reload
+; NEON-NEXT:    eor x9, x13, x9
+; NEON-NEXT:    and x13, x11, #0x80000000000000
+; NEON-NEXT:    eor x12, x12, x28
+; NEON-NEXT:    eor x9, x14, x9
+; NEON-NEXT:    eor x14, x24, x23
+; NEON-NEXT:    eor x12, x12, x26
+; NEON-NEXT:    eor x14, x14, x22
+; NEON-NEXT:    and x23, x11, #0x800000000000000
+; NEON-NEXT:    eor x12, x12, x25
+; NEON-NEXT:    eor x14, x14, x20
+; NEON-NEXT:    mul x22, x10, x23
+; NEON-NEXT:    eor x12, x12, x27
+; NEON-NEXT:    and x20, x11, #0x1000000000000000
+; NEON-NEXT:    and x11, x11, #0x8000000000000000
+; NEON-NEXT:    eor x12, x12, x29
+; NEON-NEXT:    mul x13, x10, x13
+; NEON-NEXT:    eor x12, x12, x30
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    eor x12, x14, x7
+; NEON-NEXT:    mul x14, x10, x20
+; NEON-NEXT:    eor x12, x12, x6
+; NEON-NEXT:    eor x12, x12, x5
+; NEON-NEXT:    mul x10, x10, x11
+; NEON-NEXT:    ldr x11, [sp, #1168] // 8-byte Reload
+; NEON-NEXT:    eor x12, x12, x4
+; NEON-NEXT:    eor x12, x12, x3
+; NEON-NEXT:    eor x11, x11, x22
+; NEON-NEXT:    eor x12, x12, x2
+; NEON-NEXT:    eor x9, x9, x12
+; NEON-NEXT:    eor x12, x21, x19
+; NEON-NEXT:    eor x8, x9, x8
+; NEON-NEXT:    eor x9, x11, x14
+; NEON-NEXT:    rbit x11, x12
+; NEON-NEXT:    ldr x12, [sp, #1248] // 8-byte Reload
+; NEON-NEXT:    ldr x14, [sp, #1200] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x0
+; NEON-NEXT:    rbit x8, x8
+; NEON-NEXT:    eor x9, x9, x17
+; NEON-NEXT:    lsr x1, x11, #1
+; NEON-NEXT:    eor x12, x12, x14
+; NEON-NEXT:    ldr x14, [sp, #1184] // 8-byte Reload
+; NEON-NEXT:    eor x9, x9, x10
+; NEON-NEXT:    ldr x10, [sp, #1368] // 8-byte Reload
+; NEON-NEXT:    ldr x11, [sp, #1344] // 8-byte Reload
+; NEON-NEXT:    lsr x3, x8, #1
+; NEON-NEXT:    eor x13, x14, x13
+; NEON-NEXT:    eor x8, x12, x13
+; NEON-NEXT:    eor x0, x10, x11
+; NEON-NEXT:    eor x2, x8, x9
+; NEON-NEXT:    add sp, sp, #1376
+; NEON-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-NEXT:    ret
+;
+; NEON-AES-LABEL: clmul_v2i128_neon_zext:
+; NEON-AES:       // %bb.0:
+; NEON-AES-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NEON-AES-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NEON-AES-NEXT:    sub sp, sp, #1376
+; NEON-AES-NEXT:    .cfi_def_cfa_offset 1472
+; NEON-AES-NEXT:    .cfi_offset w19, -8
+; NEON-AES-NEXT:    .cfi_offset w20, -16
+; NEON-AES-NEXT:    .cfi_offset w21, -24
+; NEON-AES-NEXT:    .cfi_offset w22, -32
+; NEON-AES-NEXT:    .cfi_offset w23, -40
+; NEON-AES-NEXT:    .cfi_offset w24, -48
+; NEON-AES-NEXT:    .cfi_offset w25, -56
+; NEON-AES-NEXT:    .cfi_offset w26, -64
+; NEON-AES-NEXT:    .cfi_offset w27, -72
+; NEON-AES-NEXT:    .cfi_offset w28, -80
+; NEON-AES-NEXT:    .cfi_offset w30, -88
+; NEON-AES-NEXT:    .cfi_offset w29, -96
+; NEON-AES-NEXT:    fmov x9, d1
+; NEON-AES-NEXT:    fmov x8, d0
+; NEON-AES-NEXT:    and x10, x9, #0x2
+; NEON-AES-NEXT:    mul x0, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x1
+; NEON-AES-NEXT:    mul x5, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x4
+; NEON-AES-NEXT:    mul x7, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x8
+; NEON-AES-NEXT:    mul x24, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x10
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    eor x5, x7, x24
+; NEON-AES-NEXT:    str x10, [sp, #1368] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1360] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1352] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    ldr x7, [sp, #1352] // 8-byte Reload
+; NEON-AES-NEXT:    str x10, [sp, #1344] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1328] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    ldr x24, [sp, #1328] // 8-byte Reload
+; NEON-AES-NEXT:    str x10, [sp, #1320] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1336] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1312] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1304] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1296] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1288] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1280] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1272] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1248] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1240] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1264] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1232] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1256] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1216] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1176] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1208] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1200] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1192] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1184] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1224] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1168] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1120] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1112] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1160] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1152] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1136] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1128] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1144] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1104] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1048] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1040] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1080] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1072] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1064] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1056] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1096] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1088] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1008] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #968] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #960] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #992] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #984] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1000] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #976] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1032] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1024] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #1016] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #944] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #904] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #936] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #928] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #920] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #912] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #952] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000000000000
+; NEON-AES-NEXT:    rbit x9, x9
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    rbit x8, x8
+; NEON-AES-NEXT:    str x10, [sp, #448] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #896] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #888] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #880] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #872] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #864] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #856] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #848] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #840] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #832] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #824] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #816] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #808] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #800] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #792] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #784] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #776] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #768] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #744] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #736] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #760] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #728] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #752] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #720] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #672] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #704] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #696] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #688] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #680] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #712] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #664] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #616] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #608] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #632] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #624] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #656] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #648] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #640] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #600] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x4000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #576] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #568] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x10000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #560] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #552] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #544] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #536] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #592] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x200000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #584] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #520] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000000
+; NEON-AES-NEXT:    mul x11, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    stp x10, x11, [sp, #456] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000000
+; NEON-AES-NEXT:    mul x11, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x4000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    stp x10, x11, [sp, #480] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x10, x9, #0x8000000000000
+; NEON-AES-NEXT:    mul x11, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x10000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #472] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x20000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    stp x10, x11, [sp, #504] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x10, x9, #0x40000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #496] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x80000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #528] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x100000000000000
+; NEON-AES-NEXT:    mul x11, x8, x10
+; NEON-AES-NEXT:    and x10, x9, #0x200000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #400] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x400000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #424] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x800000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    str x10, [sp, #416] // 8-byte Spill
+; NEON-AES-NEXT:    and x10, x9, #0x1000000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    stp x10, x11, [sp, #432] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x10, x9, #0x2000000000000000
+; NEON-AES-NEXT:    and x9, x9, #0x4000000000000000
+; NEON-AES-NEXT:    mul x10, x8, x10
+; NEON-AES-NEXT:    mov x11, v1.d[1]
+; NEON-AES-NEXT:    mul x8, x8, x9
+; NEON-AES-NEXT:    str x10, [sp, #408] // 8-byte Spill
+; NEON-AES-NEXT:    mov x10, v0.d[1]
+; NEON-AES-NEXT:    str x8, [sp, #392] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x2
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #296] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x1
+; NEON-AES-NEXT:    mul x9, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x4
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #224] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x8
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #168] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x10
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #272] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x20
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #216] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x40
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #248] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x11, #0x80
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #136] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x100
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #88] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x200
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    ldr x28, [sp, #88] // 8-byte Reload
+; NEON-AES-NEXT:    str x8, [sp, #104] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x400
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #160] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x800
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #264] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x1000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #208] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x2000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #240] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x4000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #288] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x8000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #304] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x10000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #48] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x20000
+; NEON-AES-NEXT:    mul x26, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x40000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #16] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x80000
+; NEON-AES-NEXT:    mul x9, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x100000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    stp x9, x8, [sp, #72] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x11, #0x200000
+; NEON-AES-NEXT:    mul x9, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x400000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #184] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x800000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #128] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x1000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #120] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x2000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #144] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x11, #0x4000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #200] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x8000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #232] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x10000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #280] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x20000000
+; NEON-AES-NEXT:    mul x20, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x40000000
+; NEON-AES-NEXT:    mul x15, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x80000000
+; NEON-AES-NEXT:    mul x19, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x100000000
+; NEON-AES-NEXT:    mul x22, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x200000000
+; NEON-AES-NEXT:    eor x15, x20, x15
+; NEON-AES-NEXT:    mul x25, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x400000000
+; NEON-AES-NEXT:    eor x15, x15, x19
+; NEON-AES-NEXT:    mul x29, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x800000000
+; NEON-AES-NEXT:    eor x15, x15, x22
+; NEON-AES-NEXT:    mul x9, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x1000000000
+; NEON-AES-NEXT:    eor x15, x15, x25
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    eor x15, x15, x29
+; NEON-AES-NEXT:    str x8, [sp, #64] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x2000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    stp x8, x9, [sp, #32] // 16-byte Folded Spill
+; NEON-AES-NEXT:    and x8, x11, #0x4000000000
+; NEON-AES-NEXT:    and x9, x11, #0x400000000000000
+; NEON-AES-NEXT:    mul x27, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x8000000000
+; NEON-AES-NEXT:    mul x30, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x10000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    mul x13, x10, x9
+; NEON-AES-NEXT:    rbit x9, x10
+; NEON-AES-NEXT:    str x8, [sp, #24] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x20000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #56] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x40000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #96] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x80000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #112] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x100000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #176] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x200000000000
+; NEON-AES-NEXT:    mul x8, x10, x8
+; NEON-AES-NEXT:    str x8, [sp, #192] // 8-byte Spill
+; NEON-AES-NEXT:    and x8, x11, #0x400000000000
+; NEON-AES-NEXT:    mul x17, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x800000000000
+; NEON-AES-NEXT:    mul x12, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x1000000000000
+; NEON-AES-NEXT:    mul x14, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x2000000000000
+; NEON-AES-NEXT:    mul x18, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x4000000000000
+; NEON-AES-NEXT:    eor x12, x17, x12
+; NEON-AES-NEXT:    mul x1, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x8000000000000
+; NEON-AES-NEXT:    eor x12, x12, x14
+; NEON-AES-NEXT:    mul x3, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x10000000000000
+; NEON-AES-NEXT:    eor x12, x12, x18
+; NEON-AES-NEXT:    mul x6, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x20000000000000
+; NEON-AES-NEXT:    eor x12, x12, x1
+; NEON-AES-NEXT:    mul x21, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x40000000000000
+; NEON-AES-NEXT:    eor x12, x12, x3
+; NEON-AES-NEXT:    mul x23, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x100000000000000
+; NEON-AES-NEXT:    eor x12, x12, x6
+; NEON-AES-NEXT:    mul x4, x10, x8
+; NEON-AES-NEXT:    and x8, x11, #0x200000000000000
+; NEON-AES-NEXT:    eor x12, x12, x21
+; NEON-AES-NEXT:    mul x2, x10, x8
+; NEON-AES-NEXT:    rbit x8, x11
+; NEON-AES-NEXT:    eor x12, x12, x23
+; NEON-AES-NEXT:    and x16, x8, #0x2
+; NEON-AES-NEXT:    and x14, x8, #0x800000000
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    mul x14, x9, x14
+; NEON-AES-NEXT:    str x16, [sp, #384] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x1
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #376] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x4
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #368] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x8
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #360] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x10
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #352] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x20
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #336] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x40
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #344] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x80
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #328] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x100
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #320] // 8-byte Spill
+; NEON-AES-NEXT:    and x16, x8, #0x200
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    str x16, [sp, #312] // 8-byte Spill
+; NEON-AES-NEXT:    eor x16, x0, x5
+; NEON-AES-NEXT:    ldr x0, [sp, #1368] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x5, [sp, #1360] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    and x5, x8, #0x400
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #1344] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1320] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1304] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x7, x0
+; NEON-AES-NEXT:    ldr x7, [sp, #1336] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1360] // 8-byte Spill
+; NEON-AES-NEXT:    and x5, x8, #0x800
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    ldr x7, [sp, #1312] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1296] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1248] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x7, x0
+; NEON-AES-NEXT:    ldr x7, [sp, #1288] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1352] // 8-byte Spill
+; NEON-AES-NEXT:    and x5, x8, #0x1000
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    ldr x7, [sp, #1272] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1256] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1336] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #1280] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #1240] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1264] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    and x7, x8, #0x2000
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    mul x5, x9, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #1176] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1328] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #1232] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #1216] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1208] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    and x7, x8, #0x4000
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1200] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    ldr x24, [sp, #1120] // 8-byte Reload
+; NEON-AES-NEXT:    str x14, [sp, #1120] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x14, [sp, #112] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    and x5, x8, #0x8000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    str x7, [sp, #1320] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #1192] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #1168] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1312] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #1184] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1224] // 8-byte Reload
+; NEON-AES-NEXT:    str x12, [sp, #1184] // 8-byte Spill
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #1112] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    and x7, x8, #0x10000
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    ldr x0, [sp, #1152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1048] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    and x5, x8, #0x20000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    str x7, [sp, #1304] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #1136] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #1104] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1296] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #1128] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1144] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #1040] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1080] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1072] // 8-byte Reload
+; NEON-AES-NEXT:    and x7, x8, #0x40000
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #968] // 8-byte Reload
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    and x5, x8, #0x80000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    str x7, [sp, #1288] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #1064] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1280] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #1056] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #1008] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #960] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1096] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    and x7, x8, #0x100000
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #992] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1032] // 8-byte Reload
+; NEON-AES-NEXT:    str x7, [sp, #1272] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #1088] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #984] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #1000] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    and x7, x8, #0x200000
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    mul x5, x9, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #904] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1264] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #976] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #944] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #936] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    and x7, x8, #0x400000
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #1016] // 8-byte Reload
+; NEON-AES-NEXT:    str x7, [sp, #1256] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #1024] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #928] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #920] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    and x7, x8, #0x800000
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    and x0, x8, #0x1000000
+; NEON-AES-NEXT:    str x16, [sp, #1368] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x16, [sp, #912] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #168] // 8-byte Reload
+; NEON-AES-NEXT:    mul x0, x9, x0
+; NEON-AES-NEXT:    eor x16, x5, x16
+; NEON-AES-NEXT:    ldr x5, [sp, #296] // 8-byte Reload
+; NEON-AES-NEXT:    str x7, [sp, #1240] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #256] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    ldr x7, [sp, #224] // 8-byte Reload
+; NEON-AES-NEXT:    str x0, [sp, #1232] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x0, [sp, #952] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #216] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    eor x0, x5, x7
+; NEON-AES-NEXT:    and x5, x8, #0x2000000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    ldr x7, [sp, #272] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #136] // 8-byte Reload
+; NEON-AES-NEXT:    eor x24, x24, x28
+; NEON-AES-NEXT:    str x5, [sp, #1224] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #248] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    ldr x7, [sp, #104] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x24, x7
+; NEON-AES-NEXT:    and x24, x8, #0x4000000
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    mul x7, x9, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #288] // 8-byte Reload
+; NEON-AES-NEXT:    str x7, [sp, #1216] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #448] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #208] // 8-byte Reload
+; NEON-AES-NEXT:    str x16, [sp, #1344] // 8-byte Spill
+; NEON-AES-NEXT:    eor x16, x0, x5
+; NEON-AES-NEXT:    and x0, x8, #0x8000000
+; NEON-AES-NEXT:    mul x0, x9, x0
+; NEON-AES-NEXT:    ldr x5, [sp, #264] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #48] // 8-byte Reload
+; NEON-AES-NEXT:    eor x7, x7, x26
+; NEON-AES-NEXT:    str x0, [sp, #1208] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x0, [sp, #240] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    ldr x5, [sp, #16] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x24
+; NEON-AES-NEXT:    ldr x24, [sp, #72] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x7, x5
+; NEON-AES-NEXT:    and x7, x8, #0x10000000
+; NEON-AES-NEXT:    mul x7, x9, x7
+; NEON-AES-NEXT:    eor x5, x5, x24
+; NEON-AES-NEXT:    str x7, [sp, #1192] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x7, [sp, #304] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #80] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x5, x5, x7
+; NEON-AES-NEXT:    and x7, x8, #0x20000000
+; NEON-AES-NEXT:    eor x0, x5, x0
+; NEON-AES-NEXT:    mul x5, x9, x7
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #184] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1176] // 8-byte Spill
+; NEON-AES-NEXT:    ldp x7, x5, [sp, #120] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    and x5, x8, #0x40000000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #200] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1160] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #144] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    and x5, x8, #0x80000000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    ldr x7, [sp, #280] // 8-byte Reload
+; NEON-AES-NEXT:    str x5, [sp, #1152] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #232] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x0, x5
+; NEON-AES-NEXT:    and x5, x8, #0x100000000
+; NEON-AES-NEXT:    mul x5, x9, x5
+; NEON-AES-NEXT:    eor x0, x0, x7
+; NEON-AES-NEXT:    eor x16, x16, x0
+; NEON-AES-NEXT:    ldr x0, [sp, #40] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x0
+; NEON-AES-NEXT:    and x0, x8, #0x200000000
+; NEON-AES-NEXT:    mul x0, x9, x0
+; NEON-AES-NEXT:    str x5, [sp, #1144] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x5, [sp, #64] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x5
+; NEON-AES-NEXT:    ldr x5, [sp, #32] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x16, x15
+; NEON-AES-NEXT:    and x16, x8, #0x400000000
+; NEON-AES-NEXT:    mul x16, x9, x16
+; NEON-AES-NEXT:    eor x5, x5, x27
+; NEON-AES-NEXT:    str x0, [sp, #1136] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x0, [sp, #24] // 8-byte Reload
+; NEON-AES-NEXT:    str x15, [sp, #1248] // 8-byte Spill
+; NEON-AES-NEXT:    eor x15, x5, x30
+; NEON-AES-NEXT:    eor x15, x15, x0
+; NEON-AES-NEXT:    str x16, [sp, #1128] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x16, [sp, #56] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #96] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #176] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    and x15, x8, #0x1000000000
+; NEON-AES-NEXT:    mul x15, x9, x15
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #832] // 8-byte Reload
+; NEON-AES-NEXT:    str x15, [sp, #1112] // 8-byte Spill
+; NEON-AES-NEXT:    ldr x15, [sp, #192] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    eor x15, x4, x2
+; NEON-AES-NEXT:    eor x12, x15, x13
+; NEON-AES-NEXT:    str x14, [sp, #1200] // 8-byte Spill
+; NEON-AES-NEXT:    and x14, x8, #0x2000000000
+; NEON-AES-NEXT:    str x12, [sp, #1168] // 8-byte Spill
+; NEON-AES-NEXT:    and x12, x8, #0x4000000000
+; NEON-AES-NEXT:    mul x28, x9, x14
+; NEON-AES-NEXT:    ldr x13, [sp, #896] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x14, [sp, #888] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x15, [sp, #872] // 8-byte Reload
+; NEON-AES-NEXT:    mul x12, x9, x12
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #880] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #848] // 8-byte Reload
+; NEON-AES-NEXT:    str x12, [sp, #1104] // 8-byte Spill
+; NEON-AES-NEXT:    eor x12, x13, x14
+; NEON-AES-NEXT:    ldr x13, [sp, #864] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x14, [sp, #856] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    and x14, x8, #0x8000000000
+; NEON-AES-NEXT:    mul x14, x9, x14
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #840] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #824] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #800] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    ldr x15, [sp, #816] // 8-byte Reload
+; NEON-AES-NEXT:    str x14, [sp, #1096] // 8-byte Spill
+; NEON-AES-NEXT:    and x14, x8, #0x10000000000
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    mul x14, x9, x14
+; NEON-AES-NEXT:    ldr x15, [sp, #808] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #792] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #744] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    ldr x15, [sp, #784] // 8-byte Reload
+; NEON-AES-NEXT:    str x14, [sp, #1088] // 8-byte Spill
+; NEON-AES-NEXT:    and x14, x8, #0x20000000000
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    mul x26, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #776] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x15, [sp, #768] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #736] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #760] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x16, [sp, #752] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    and x15, x8, #0x40000000000
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #728] // 8-byte Reload
+; NEON-AES-NEXT:    mul x25, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #672] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #720] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #704] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x80000000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #696] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    mul x27, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #688] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x16, [sp, #616] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    and x14, x8, #0x100000000000
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #664] // 8-byte Reload
+; NEON-AES-NEXT:    mul x29, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #680] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #712] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #608] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #632] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    and x15, x8, #0x200000000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #624] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    mul x30, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #656] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x16, [sp, #576] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    and x14, x8, #0x400000000000
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #600] // 8-byte Reload
+; NEON-AES-NEXT:    mul x24, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #648] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #640] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #568] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #560] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #552] // 8-byte Reload
+; NEON-AES-NEXT:    and x15, x8, #0x800000000000
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #464] // 8-byte Reload
+; NEON-AES-NEXT:    mul x23, x9, x15
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    and x14, x8, #0x1000000000000
+; NEON-AES-NEXT:    ldr x15, [sp, #544] // 8-byte Reload
+; NEON-AES-NEXT:    mul x22, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #536] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #520] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #456] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #592] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x15, x14
+; NEON-AES-NEXT:    and x15, x8, #0x2000000000000
+; NEON-AES-NEXT:    mul x20, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #584] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #488] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #480] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldp x16, x13, [sp, #504] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x4000000000000
+; NEON-AES-NEXT:    mul x7, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #400] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    ldr x14, [sp, #472] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #440] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #424] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    and x15, x8, #0x8000000000000
+; NEON-AES-NEXT:    mul x6, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #496] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #528] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #416] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #432] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x15
+; NEON-AES-NEXT:    eor x21, x12, x13
+; NEON-AES-NEXT:    ldr x12, [sp, #408] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x16
+; NEON-AES-NEXT:    and x13, x8, #0x20000000000000
+; NEON-AES-NEXT:    and x15, x8, #0x10000000000000
+; NEON-AES-NEXT:    eor x12, x14, x12
+; NEON-AES-NEXT:    ldr x14, [sp, #392] // 8-byte Reload
+; NEON-AES-NEXT:    mul x4, x9, x13
+; NEON-AES-NEXT:    ldr x16, [sp, #1336] // 8-byte Reload
+; NEON-AES-NEXT:    eor x19, x12, x14
+; NEON-AES-NEXT:    ldp x14, x12, [sp, #376] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x5, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #336] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x12
+; NEON-AES-NEXT:    ldp x13, x12, [sp, #360] // 16-byte Folded Reload
+; NEON-AES-NEXT:    eor x13, x12, x13
+; NEON-AES-NEXT:    ldr x12, [sp, #352] // 8-byte Reload
+; NEON-AES-NEXT:    eor x14, x14, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #344] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x15
+; NEON-AES-NEXT:    and x15, x8, #0x40000000000000
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    mul x3, x9, x15
+; NEON-AES-NEXT:    and x15, x8, #0x80000000000000
+; NEON-AES-NEXT:    eor x14, x14, x12
+; NEON-AES-NEXT:    ldp x13, x12, [sp, #320] // 16-byte Folded Reload
+; NEON-AES-NEXT:    mul x2, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #1360] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #312] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #1352] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #1328] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    eor x12, x14, x12
+; NEON-AES-NEXT:    ldr x14, [sp, #1320] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x15
+; NEON-AES-NEXT:    and x15, x8, #0x100000000000000
+; NEON-AES-NEXT:    ldr x16, [sp, #1272] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1312] // 8-byte Reload
+; NEON-AES-NEXT:    mul x1, x9, x15
+; NEON-AES-NEXT:    ldr x15, [sp, #1296] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1304] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #1288] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x14, x15
+; NEON-AES-NEXT:    and x14, x8, #0x200000000000000
+; NEON-AES-NEXT:    mul x0, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1280] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    ldr x15, [sp, #1240] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1256] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #1232] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x14, x15
+; NEON-AES-NEXT:    and x14, x8, #0x400000000000000
+; NEON-AES-NEXT:    mul x18, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1264] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #1192] // 8-byte Reload
+; NEON-AES-NEXT:    eor x0, x1, x0
+; NEON-AES-NEXT:    and x1, x11, #0x2000000000000000
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1224] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x13
+; NEON-AES-NEXT:    ldr x13, [sp, #1216] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x15, x14
+; NEON-AES-NEXT:    and x14, x8, #0x800000000000000
+; NEON-AES-NEXT:    mul x17, x9, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1208] // 8-byte Reload
+; NEON-AES-NEXT:    eor x13, x15, x13
+; NEON-AES-NEXT:    ldr x15, [sp, #1160] // 8-byte Reload
+; NEON-AES-NEXT:    eor x18, x0, x18
+; NEON-AES-NEXT:    eor x13, x13, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1176] // 8-byte Reload
+; NEON-AES-NEXT:    mul x0, x10, x1
+; NEON-AES-NEXT:    eor x13, x13, x16
+; NEON-AES-NEXT:    ldr x16, [sp, #1152] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x14, x15
+; NEON-AES-NEXT:    and x14, x8, #0x1000000000000000
+; NEON-AES-NEXT:    eor x15, x15, x16
+; NEON-AES-NEXT:    mul x16, x9, x14
+; NEON-AES-NEXT:    eor x14, x12, x13
+; NEON-AES-NEXT:    ldr x12, [sp, #1144] // 8-byte Reload
+; NEON-AES-NEXT:    eor x17, x18, x17
+; NEON-AES-NEXT:    and x18, x11, #0x4000000000000000
+; NEON-AES-NEXT:    eor x13, x15, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1136] // 8-byte Reload
+; NEON-AES-NEXT:    and x15, x8, #0x2000000000000000
+; NEON-AES-NEXT:    and x8, x8, #0x4000000000000000
+; NEON-AES-NEXT:    mul x15, x9, x15
+; NEON-AES-NEXT:    eor x13, x13, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1104] // 8-byte Reload
+; NEON-AES-NEXT:    eor x16, x17, x16
+; NEON-AES-NEXT:    mul x8, x9, x8
+; NEON-AES-NEXT:    ldr x9, [sp, #1112] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x28, x12
+; NEON-AES-NEXT:    ldr x28, [sp, #1128] // 8-byte Reload
+; NEON-AES-NEXT:    mul x17, x10, x18
+; NEON-AES-NEXT:    eor x13, x13, x28
+; NEON-AES-NEXT:    ldr x28, [sp, #1096] // 8-byte Reload
+; NEON-AES-NEXT:    eor x15, x16, x15
+; NEON-AES-NEXT:    eor x12, x12, x28
+; NEON-AES-NEXT:    ldr x28, [sp, #1120] // 8-byte Reload
+; NEON-AES-NEXT:    eor x8, x15, x8
+; NEON-AES-NEXT:    eor x13, x13, x28
+; NEON-AES-NEXT:    ldr x28, [sp, #1088] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x13, x9
+; NEON-AES-NEXT:    and x13, x11, #0x80000000000000
+; NEON-AES-NEXT:    eor x12, x12, x28
+; NEON-AES-NEXT:    eor x9, x14, x9
+; NEON-AES-NEXT:    eor x14, x24, x23
+; NEON-AES-NEXT:    eor x12, x12, x26
+; NEON-AES-NEXT:    eor x14, x14, x22
+; NEON-AES-NEXT:    and x23, x11, #0x800000000000000
+; NEON-AES-NEXT:    eor x12, x12, x25
+; NEON-AES-NEXT:    eor x14, x14, x20
+; NEON-AES-NEXT:    mul x22, x10, x23
+; NEON-AES-NEXT:    eor x12, x12, x27
+; NEON-AES-NEXT:    and x20, x11, #0x1000000000000000
+; NEON-AES-NEXT:    and x11, x11, #0x8000000000000000
+; NEON-AES-NEXT:    eor x12, x12, x29
+; NEON-AES-NEXT:    mul x13, x10, x13
+; NEON-AES-NEXT:    eor x12, x12, x30
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    eor x12, x14, x7
+; NEON-AES-NEXT:    mul x14, x10, x20
+; NEON-AES-NEXT:    eor x12, x12, x6
+; NEON-AES-NEXT:    eor x12, x12, x5
+; NEON-AES-NEXT:    mul x10, x10, x11
+; NEON-AES-NEXT:    ldr x11, [sp, #1168] // 8-byte Reload
+; NEON-AES-NEXT:    eor x12, x12, x4
+; NEON-AES-NEXT:    eor x12, x12, x3
+; NEON-AES-NEXT:    eor x11, x11, x22
+; NEON-AES-NEXT:    eor x12, x12, x2
+; NEON-AES-NEXT:    eor x9, x9, x12
+; NEON-AES-NEXT:    eor x12, x21, x19
+; NEON-AES-NEXT:    eor x8, x9, x8
+; NEON-AES-NEXT:    eor x9, x11, x14
+; NEON-AES-NEXT:    rbit x11, x12
+; NEON-AES-NEXT:    ldr x12, [sp, #1248] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x14, [sp, #1200] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x0
+; NEON-AES-NEXT:    rbit x8, x8
+; NEON-AES-NEXT:    eor x9, x9, x17
+; NEON-AES-NEXT:    lsr x1, x11, #1
+; NEON-AES-NEXT:    eor x12, x12, x14
+; NEON-AES-NEXT:    ldr x14, [sp, #1184] // 8-byte Reload
+; NEON-AES-NEXT:    eor x9, x9, x10
+; NEON-AES-NEXT:    ldr x10, [sp, #1368] // 8-byte Reload
+; NEON-AES-NEXT:    ldr x11, [sp, #1344] // 8-byte Reload
+; NEON-AES-NEXT:    lsr x3, x8, #1
+; NEON-AES-NEXT:    eor x13, x14, x13
+; NEON-AES-NEXT:    eor x8, x12, x13
+; NEON-AES-NEXT:    eor x0, x10, x11
+; NEON-AES-NEXT:    eor x2, x8, x9
+; NEON-AES-NEXT:    add sp, sp, #1376
+; NEON-AES-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NEON-AES-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NEON-AES-NEXT:    ret
+  %zextx = zext <2 x i64> %x to <2 x i128>
+  %zexty = zext <2 x i64> %y to <2 x i128>
+  %a = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %zextx, <2 x i128> %zexty)
+  ret <2 x i128> %a
+}
 
-attributes #0 = { "target-features"="+aes" }
\ No newline at end of file
+attributes #0 = { "target-features"="+aes" }
diff --git a/llvm/test/CodeGen/AArch64/clmul-scalable.ll b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
index 889bfccace0e8..58dbba833f67b 100644
--- a/llvm/test/CodeGen/AArch64/clmul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
@@ -1,1172 +1,2242 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - -mattr=+sve | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - -mattr=+sve2 | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: llc -mtriple=aarch64-linux-unknown-gnu %s -o - -mattr=+sve2-aes | FileCheck %s --check-prefix=CHECK-SVE2-AES
 
-define <vscale x 16 x i8> @clmul_nxv16i8_sve(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) #0 {
-; CHECK-LABEL: clmul_nxv16i8_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    and z1.b, z1.b, #0x80
-; CHECK-NEXT:    and z2.b, z2.b, #0x2
-; CHECK-NEXT:    and z3.b, z3.b, #0x1
-; CHECK-NEXT:    and z4.b, z4.b, #0x4
-; CHECK-NEXT:    and z5.b, z5.b, #0x8
-; CHECK-NEXT:    and z6.b, z6.b, #0x10
-; CHECK-NEXT:    and z7.b, z7.b, #0x20
-; CHECK-NEXT:    and z24.b, z24.b, #0x40
-; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z0.b
-; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z0.b
-; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z0.b
-; CHECK-NEXT:    mul z5.b, p0/m, z5.b, z0.b
-; CHECK-NEXT:    mul z6.b, p0/m, z6.b, z0.b
-; CHECK-NEXT:    mul z7.b, p0/m, z7.b, z0.b
-; CHECK-NEXT:    mul z24.b, p0/m, z24.b, z0.b
-; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    eor z3.d, z4.d, z5.d
-; CHECK-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-NEXT:    eor z1.d, z2.d, z3.d
-; CHECK-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-NEXT:    ret
+define <vscale x 16 x i8> @clmul_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
+; CHECK-SVE-LABEL: clmul_nxv16i8:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    ptrue p0.b
+; CHECK-SVE-NEXT:    and z1.b, z1.b, #0x80
+; CHECK-SVE-NEXT:    and z2.b, z2.b, #0x2
+; CHECK-SVE-NEXT:    and z3.b, z3.b, #0x1
+; CHECK-SVE-NEXT:    and z4.b, z4.b, #0x4
+; CHECK-SVE-NEXT:    and z5.b, z5.b, #0x8
+; CHECK-SVE-NEXT:    and z6.b, z6.b, #0x10
+; CHECK-SVE-NEXT:    and z7.b, z7.b, #0x20
+; CHECK-SVE-NEXT:    and z24.b, z24.b, #0x40
+; CHECK-SVE-NEXT:    mul z2.b, p0/m, z2.b, z0.b
+; CHECK-SVE-NEXT:    mul z3.b, p0/m, z3.b, z0.b
+; CHECK-SVE-NEXT:    mul z4.b, p0/m, z4.b, z0.b
+; CHECK-SVE-NEXT:    mul z5.b, p0/m, z5.b, z0.b
+; CHECK-SVE-NEXT:    mul z6.b, p0/m, z6.b, z0.b
+; CHECK-SVE-NEXT:    mul z7.b, p0/m, z7.b, z0.b
+; CHECK-SVE-NEXT:    mul z24.b, p0/m, z24.b, z0.b
+; CHECK-SVE-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv16i8:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    pmul z0.b, z0.b, z1.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv16i8:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    pmul z0.b, z0.b, z1.b
+; CHECK-SVE2-AES-NEXT:    ret
   %a = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 8 x i16> @clmul_nxv8i16_sve(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) #0 {
-; CHECK-LABEL: clmul_nxv8i16_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    and z2.h, z2.h, #0x2
-; CHECK-NEXT:    and z3.h, z3.h, #0x1
-; CHECK-NEXT:    and z4.h, z4.h, #0x4
-; CHECK-NEXT:    and z5.h, z5.h, #0x8
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    and z6.h, z6.h, #0x10
-; CHECK-NEXT:    and z7.h, z7.h, #0x20
-; CHECK-NEXT:    and z24.h, z24.h, #0x80
-; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z0.h
-; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z0.h
-; CHECK-NEXT:    and z25.h, z25.h, #0x100
-; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z0.h
-; CHECK-NEXT:    mul z5.h, p0/m, z5.h, z0.h
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    mul z6.h, p0/m, z6.h, z0.h
-; CHECK-NEXT:    mul z7.h, p0/m, z7.h, z0.h
-; CHECK-NEXT:    and z26.h, z26.h, #0x800
-; CHECK-NEXT:    mul z24.h, p0/m, z24.h, z0.h
-; CHECK-NEXT:    mul z25.h, p0/m, z25.h, z0.h
-; CHECK-NEXT:    and z27.h, z27.h, #0x1000
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    mov z30.d, z1.d
-; CHECK-NEXT:    and z28.h, z28.h, #0x40
-; CHECK-NEXT:    mul z26.h, p0/m, z26.h, z0.h
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    eor z3.d, z4.d, z5.d
-; CHECK-NEXT:    mul z27.h, p0/m, z27.h, z0.h
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z28.h, p0/m, z28.h, z0.h
-; CHECK-NEXT:    and z29.h, z29.h, #0x200
-; CHECK-NEXT:    and z30.h, z30.h, #0x2000
-; CHECK-NEXT:    eor z6.d, z6.d, z7.d
-; CHECK-NEXT:    eor z7.d, z24.d, z25.d
-; CHECK-NEXT:    and z1.h, z1.h, #0x8000
-; CHECK-NEXT:    and z4.h, z4.h, #0x400
-; CHECK-NEXT:    and z5.h, z5.h, #0x4000
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    mul z29.h, p0/m, z29.h, z0.h
-; CHECK-NEXT:    mul z30.h, p0/m, z30.h, z0.h
-; CHECK-NEXT:    eor z24.d, z26.d, z27.d
-; CHECK-NEXT:    eor z3.d, z6.d, z28.d
-; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z0.h
-; CHECK-NEXT:    mul z5.h, p0/m, z5.h, z0.h
-; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    eor z6.d, z7.d, z29.d
-; CHECK-NEXT:    eor z7.d, z24.d, z30.d
-; CHECK-NEXT:    eor z1.d, z2.d, z3.d
-; CHECK-NEXT:    eor z2.d, z6.d, z4.d
-; CHECK-NEXT:    eor z3.d, z7.d, z5.d
-; CHECK-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-NEXT:    eor z0.d, z3.d, z0.d
-; CHECK-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-NEXT:    ret
+define <vscale x 8 x i16> @clmul_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
+; CHECK-SVE-LABEL: clmul_nxv8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    ptrue p0.h
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE-NEXT:    and z4.h, z4.h, #0x4
+; CHECK-SVE-NEXT:    and z5.h, z5.h, #0x8
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    and z6.h, z6.h, #0x10
+; CHECK-SVE-NEXT:    and z7.h, z7.h, #0x20
+; CHECK-SVE-NEXT:    and z24.h, z24.h, #0x80
+; CHECK-SVE-NEXT:    mul z2.h, p0/m, z2.h, z0.h
+; CHECK-SVE-NEXT:    mul z3.h, p0/m, z3.h, z0.h
+; CHECK-SVE-NEXT:    and z25.h, z25.h, #0x100
+; CHECK-SVE-NEXT:    mul z4.h, p0/m, z4.h, z0.h
+; CHECK-SVE-NEXT:    mul z5.h, p0/m, z5.h, z0.h
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mul z6.h, p0/m, z6.h, z0.h
+; CHECK-SVE-NEXT:    mul z7.h, p0/m, z7.h, z0.h
+; CHECK-SVE-NEXT:    and z26.h, z26.h, #0x800
+; CHECK-SVE-NEXT:    mul z24.h, p0/m, z24.h, z0.h
+; CHECK-SVE-NEXT:    mul z25.h, p0/m, z25.h, z0.h
+; CHECK-SVE-NEXT:    and z27.h, z27.h, #0x1000
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    and z28.h, z28.h, #0x40
+; CHECK-SVE-NEXT:    mul z26.h, p0/m, z26.h, z0.h
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mul z27.h, p0/m, z27.h, z0.h
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mul z28.h, p0/m, z28.h, z0.h
+; CHECK-SVE-NEXT:    and z29.h, z29.h, #0x200
+; CHECK-SVE-NEXT:    and z30.h, z30.h, #0x2000
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    and z1.h, z1.h, #0x8000
+; CHECK-SVE-NEXT:    and z4.h, z4.h, #0x400
+; CHECK-SVE-NEXT:    and z5.h, z5.h, #0x4000
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mul z29.h, p0/m, z29.h, z0.h
+; CHECK-SVE-NEXT:    mul z30.h, p0/m, z30.h, z0.h
+; CHECK-SVE-NEXT:    eor z24.d, z26.d, z27.d
+; CHECK-SVE-NEXT:    eor z3.d, z6.d, z28.d
+; CHECK-SVE-NEXT:    mul z4.h, p0/m, z4.h, z0.h
+; CHECK-SVE-NEXT:    mul z5.h, p0/m, z5.h, z0.h
+; CHECK-SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; CHECK-SVE-NEXT:    eor z6.d, z7.d, z29.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z30.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z2.d, z6.d, z4.d
+; CHECK-SVE-NEXT:    eor z3.d, z7.d, z5.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv8i16:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE2-NEXT:    and z4.h, z4.h, #0x8
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x4
+; CHECK-SVE2-NEXT:    and z6.h, z6.h, #0x20
+; CHECK-SVE2-NEXT:    mul z2.h, z0.h, z2.h
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    mul z6.h, z0.h, z6.h
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    and z4.h, z4.h, #0x80
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x40
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x200
+; CHECK-SVE2-NEXT:    and z6.h, z6.h, #0x100
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    mul z6.h, z0.h, z6.h
+; CHECK-SVE2-NEXT:    and z4.h, z4.h, #0x800
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x400
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x2000
+; CHECK-SVE2-NEXT:    and z6.h, z6.h, #0x1000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0x4000
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z6.h
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x8000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv8i16:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.h, z4.h, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.h, z6.h, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.h, z0.h, z2.h
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-AES-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    mul z6.h, z0.h, z6.h
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    and z4.h, z4.h, #0x80
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-AES-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x200
+; CHECK-SVE2-AES-NEXT:    and z6.h, z6.h, #0x100
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    mul z6.h, z0.h, z6.h
+; CHECK-SVE2-AES-NEXT:    and z4.h, z4.h, #0x800
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x400
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-AES-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x2000
+; CHECK-SVE2-AES-NEXT:    and z6.h, z6.h, #0x1000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z1.h, z1.h, #0x4000
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z6.h
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x8000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    mul z0.h, z0.h, z1.h
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
   %a = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 4 x i32> @clmul_nxv4i32_sve(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) #0 {
-; CHECK-LABEL: clmul_nxv4i32_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    and z2.s, z2.s, #0x2
-; CHECK-NEXT:    and z3.s, z3.s, #0x1
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    and z4.s, z4.s, #0x4
-; CHECK-NEXT:    and z5.s, z5.s, #0x8
-; CHECK-NEXT:    and z6.s, z6.s, #0x10
-; CHECK-NEXT:    and z7.s, z7.s, #0x20
-; CHECK-NEXT:    and z24.s, z24.s, #0x80
-; CHECK-NEXT:    and z25.s, z25.s, #0x100
-; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT:    mul z5.s, p0/m, z5.s, z0.s
-; CHECK-NEXT:    and z26.s, z26.s, #0x40
-; CHECK-NEXT:    mul z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT:    mul z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z0.s
-; CHECK-NEXT:    mul z25.s, p0/m, z25.s, z0.s
-; CHECK-NEXT:    and z27.s, z27.s, #0x200
-; CHECK-NEXT:    mul z26.s, p0/m, z26.s, z0.s
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    eor z4.d, z4.d, z5.d
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    and z28.s, z28.s, #0x8000
-; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z0.s
-; CHECK-NEXT:    eor z5.d, z6.d, z7.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x400
-; CHECK-NEXT:    eor z6.d, z24.d, z25.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z28.s, p0/m, z28.s, z0.s
-; CHECK-NEXT:    eor z2.d, z2.d, z4.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    eor z4.d, z5.d, z26.d
-; CHECK-NEXT:    and z7.s, z7.s, #0x800
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT:    eor z5.d, z6.d, z27.d
-; CHECK-NEXT:    and z24.s, z24.s, #0x1000
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z25.s, z25.s, #0x800000
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT:    eor z2.d, z2.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z0.s
-; CHECK-NEXT:    and z26.s, z26.s, #0x40000
-; CHECK-NEXT:    and z29.s, z29.s, #0x100000
-; CHECK-NEXT:    mul z25.s, p0/m, z25.s, z0.s
-; CHECK-NEXT:    eor z3.d, z5.d, z3.d
-; CHECK-NEXT:    and z6.s, z6.s, #0x2000
-; CHECK-NEXT:    and z4.s, z4.s, #0x10000
-; CHECK-NEXT:    and z27.s, z27.s, #0x1000000
-; CHECK-NEXT:    mul z26.s, p0/m, z26.s, z0.s
-; CHECK-NEXT:    mul z29.s, p0/m, z29.s, z0.s
-; CHECK-NEXT:    mul z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z0.s
-; CHECK-NEXT:    eor z5.d, z7.d, z24.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x20000
-; CHECK-NEXT:    eor z5.d, z5.d, z6.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z24.s, z24.s, #0x400000
-; CHECK-NEXT:    and z7.s, z7.s, #0x4000
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT:    and z6.s, z6.s, #0x80000
-; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z0.s
-; CHECK-NEXT:    mul z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT:    eor z3.d, z4.d, z3.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mul z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT:    eor z24.d, z24.d, z25.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    eor z5.d, z5.d, z7.d
-; CHECK-NEXT:    and z4.s, z4.s, #0x2000000
-; CHECK-NEXT:    eor z3.d, z3.d, z26.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    and z25.s, z25.s, #0x4000000
-; CHECK-NEXT:    eor z7.d, z24.d, z27.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT:    eor z3.d, z3.d, z6.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z26.s, z26.s, #0x200000
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    eor z5.d, z5.d, z28.d
-; CHECK-NEXT:    mul z25.s, p0/m, z25.s, z0.s
-; CHECK-NEXT:    and z24.s, z24.s, #0x20000000
-; CHECK-NEXT:    and z6.s, z6.s, #0x8000000
-; CHECK-NEXT:    eor z3.d, z3.d, z29.d
-; CHECK-NEXT:    eor z4.d, z7.d, z4.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mul z26.s, p0/m, z26.s, z0.s
-; CHECK-NEXT:    and z27.s, z27.s, #0x40000000
-; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z0.s
-; CHECK-NEXT:    and z1.s, z1.s, #0x80000000
-; CHECK-NEXT:    mul z6.s, p0/m, z6.s, z0.s
-; CHECK-NEXT:    eor z2.d, z2.d, z5.d
-; CHECK-NEXT:    and z7.s, z7.s, #0x10000000
-; CHECK-NEXT:    eor z4.d, z4.d, z25.d
-; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z0.s
-; CHECK-NEXT:    eor z3.d, z3.d, z26.d
-; CHECK-NEXT:    mul z7.s, p0/m, z7.s, z0.s
-; CHECK-NEXT:    eor z4.d, z4.d, z6.d
-; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    eor z1.d, z2.d, z3.d
-; CHECK-NEXT:    eor z3.d, z24.d, z27.d
-; CHECK-NEXT:    eor z2.d, z4.d, z7.d
-; CHECK-NEXT:    eor z0.d, z3.d, z0.d
-; CHECK-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-NEXT:    ret
+define <vscale x 4 x i32> @clmul_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
+; CHECK-SVE-LABEL: clmul_nxv4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    ptrue p0.s
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x4
+; CHECK-SVE-NEXT:    and z5.s, z5.s, #0x8
+; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x10
+; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x20
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x80
+; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x100
+; CHECK-SVE-NEXT:    mul z2.s, p0/m, z2.s, z0.s
+; CHECK-SVE-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x40
+; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
+; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x200
+; CHECK-SVE-NEXT:    mul z26.s, p0/m, z26.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    and z28.s, z28.s, #0x8000
+; CHECK-SVE-NEXT:    mul z27.s, p0/m, z27.s, z0.s
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z3.s, z3.s, #0x400
+; CHECK-SVE-NEXT:    eor z6.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z28.s, p0/m, z28.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z4.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z26.d
+; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x800
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z27.d
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x1000
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x800000
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z4.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x40000
+; CHECK-SVE-NEXT:    and z29.s, z29.s, #0x100000
+; CHECK-SVE-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z5.d, z3.d
+; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x2000
+; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x10000
+; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x1000000
+; CHECK-SVE-NEXT:    mul z26.s, p0/m, z26.s, z0.s
+; CHECK-SVE-NEXT:    mul z29.s, p0/m, z29.s, z0.s
+; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-NEXT:    mul z27.s, p0/m, z27.s, z0.s
+; CHECK-SVE-NEXT:    eor z5.d, z7.d, z24.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z3.s, z3.s, #0x20000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x400000
+; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x4000
+; CHECK-SVE-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x80000
+; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z3.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
+; CHECK-SVE-NEXT:    eor z24.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z7.d
+; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x2000000
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x4000000
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z27.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x200000
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z28.d
+; CHECK-SVE-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x20000000
+; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x8000000
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z29.d
+; CHECK-SVE-NEXT:    eor z4.d, z7.d, z4.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.s, p0/m, z26.s, z0.s
+; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x40000000
+; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    and z1.s, z1.s, #0x80000000
+; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z5.d
+; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x10000000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z25.d
+; CHECK-SVE-NEXT:    mul z27.s, p0/m, z27.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z26.d
+; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z6.d
+; CHECK-SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z24.d, z27.d
+; CHECK-SVE-NEXT:    eor z2.d, z4.d, z7.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv4i32:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x8
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x4
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x20
+; CHECK-SVE2-NEXT:    mul z2.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x80
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x40
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x200
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x100
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x800
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x400
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x2000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x1000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x8000
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x4000
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x20000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x10000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x80000
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x40000
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x200000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x100000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x800000
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x400000
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x2000000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x1000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x8000000
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x4000000
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x20000000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x10000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0x40000000
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x80000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    mul z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv4i32:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.s, z0.s, z2.s
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x80
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x200
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x100
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x800
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x400
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x2000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x1000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x8000
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x4000
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x20000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x10000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x80000
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x40000
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x200000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x100000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x800000
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x400000
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x2000000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x1000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x8000000
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x4000000
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x20000000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x10000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z1.s, z1.s, #0x40000000
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x80000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    mul z0.s, z0.s, z1.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
   %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 2 x i64> @clmul_nxv2i64_sve(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) #0 {
-; CHECK-LABEL: clmul_nxv2i64_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x2
-; CHECK-NEXT:    and z3.d, z3.d, #0x1
-; CHECK-NEXT:    and z4.d, z4.d, #0x4
-; CHECK-NEXT:    and z5.d, z5.d, #0x8
-; CHECK-NEXT:    and z6.d, z6.d, #0x10
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x20
-; CHECK-NEXT:    and z24.d, z24.d, #0x40
-; CHECK-NEXT:    and z25.d, z25.d, #0x80
-; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z0.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x100
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    and z27.d, z27.d, #0x20000
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    eor z3.d, z4.d, z5.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    and z28.d, z28.d, #0x400000
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    eor z4.d, z6.d, z7.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z29.d, z29.d, #0x800000
-; CHECK-NEXT:    mov z30.d, z1.d
-; CHECK-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-NEXT:    and z5.d, z5.d, #0x200
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    and z6.d, z6.d, #0x800
-; CHECK-NEXT:    eor z3.d, z4.d, z24.d
-; CHECK-NEXT:    eor z4.d, z25.d, z26.d
-; CHECK-NEXT:    mul z29.d, p0/m, z29.d, z0.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x1000
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    and z30.d, z30.d, #0x800000000
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    and z25.d, z25.d, #0x2000
-; CHECK-NEXT:    and z26.d, z26.d, #0x10000
-; CHECK-NEXT:    and z24.d, z24.d, #0x400
-; CHECK-NEXT:    mul z30.d, p0/m, z30.d, z0.d
-; CHECK-NEXT:    eor z3.d, z4.d, z5.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    eor z6.d, z6.d, z7.d
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x4000
-; CHECK-NEXT:    eor z6.d, z6.d, z25.d
-; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT:    eor z25.d, z26.d, z27.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x80000
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    eor z3.d, z3.d, z24.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    and z27.d, z27.d, #0x1000000
-; CHECK-NEXT:    eor z5.d, z25.d, z5.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x8000
-; CHECK-NEXT:    eor z4.d, z6.d, z4.d
-; CHECK-NEXT:    and z24.d, z24.d, #0x2000000
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    eor z6.d, z28.d, z29.d
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    and z25.d, z25.d, #0x100000
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    eor z5.d, z5.d, z7.d
-; CHECK-NEXT:    and z28.d, z28.d, #0x4000000
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    eor z6.d, z6.d, z27.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    and z29.d, z29.d, #0x40000000
-; CHECK-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-NEXT:    eor z3.d, z4.d, z26.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x200000
-; CHECK-NEXT:    and z27.d, z27.d, #0x20000000
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    eor z4.d, z5.d, z25.d
-; CHECK-NEXT:    eor z5.d, z6.d, z24.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mul z29.d, p0/m, z29.d, z0.d
-; CHECK-NEXT:    eor z3.d, z2.d, z3.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x4000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x80000000
-; CHECK-NEXT:    eor z2.d, z5.d, z28.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    eor z4.d, z4.d, z7.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z25.d, z25.d, #0x2000000000
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    and z5.d, z5.d, #0x100000000
-; CHECK-NEXT:    eor z24.d, z27.d, z29.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    and z28.d, z28.d, #0x8000000000
-; CHECK-NEXT:    and z7.d, z7.d, #0x8000000
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z0.d
-; CHECK-NEXT:    eor z3.d, z3.d, z4.d
-; CHECK-NEXT:    eor z6.d, z24.d, z6.d
-; CHECK-NEXT:    and z27.d, z27.d, #0x200000000
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-NEXT:    and z29.d, z29.d, #0x10000000
-; CHECK-NEXT:    eor z25.d, z25.d, z26.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    eor z5.d, z6.d, z5.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    and z24.d, z24.d, #0x400000000
-; CHECK-NEXT:    mul z29.d, p0/m, z29.d, z0.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x20000000000
-; CHECK-NEXT:    eor z2.d, z2.d, z7.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000000
-; CHECK-NEXT:    eor z4.d, z25.d, z28.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    eor z5.d, z5.d, z27.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x40000000000
-; CHECK-NEXT:    eor z2.d, z2.d, z29.d
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    and z25.d, z25.d, #0x400000000000
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    and z27.d, z27.d, #0x800000000000
-; CHECK-NEXT:    mov z29.d, z1.d
-; CHECK-NEXT:    eor z5.d, z5.d, z24.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    and z28.d, z28.d, #0x400000000000000
-; CHECK-NEXT:    eor z4.d, z4.d, z6.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    and z24.d, z24.d, #0x1000000000
-; CHECK-NEXT:    eor z5.d, z5.d, z30.d
-; CHECK-NEXT:    mov z30.d, z1.d
-; CHECK-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-NEXT:    and z29.d, z29.d, #0x200000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x80000000000
-; CHECK-NEXT:    eor z4.d, z4.d, z26.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    and z30.d, z30.d, #0x800000000000000
-; CHECK-NEXT:    mul z29.d, p0/m, z29.d, z0.d
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x1000000000000
-; CHECK-NEXT:    eor z4.d, z4.d, z7.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mul z30.d, p0/m, z30.d, z0.d
-; CHECK-NEXT:    eor z3.d, z5.d, z24.d
-; CHECK-NEXT:    eor z5.d, z25.d, z27.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    mov z27.d, z1.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x2000000000000
-; CHECK-NEXT:    eor z4.d, z4.d, z6.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z24.d, z24.d, #0x100000000000000
-; CHECK-NEXT:    eor z2.d, z2.d, z3.d
-; CHECK-NEXT:    and z25.d, z25.d, #0x200000000000000
-; CHECK-NEXT:    and z27.d, z27.d, #0x100000000000
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    and z6.d, z6.d, #0x4000000000000
-; CHECK-NEXT:    eor z5.d, z5.d, z26.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z0.d
-; CHECK-NEXT:    mul z6.d, p0/m, z6.d, z0.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x8000000000000
-; CHECK-NEXT:    eor z5.d, z5.d, z7.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    eor z4.d, z4.d, z27.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x10000000000000
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    eor z5.d, z5.d, z6.d
-; CHECK-NEXT:    eor z6.d, z24.d, z25.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    mov z25.d, z1.d
-; CHECK-NEXT:    eor z3.d, z4.d, z29.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    and z24.d, z24.d, #0x20000000000000
-; CHECK-NEXT:    and z25.d, z25.d, #0x1000000000000000
-; CHECK-NEXT:    eor z5.d, z5.d, z26.d
-; CHECK-NEXT:    mov z26.d, z1.d
-; CHECK-NEXT:    eor z6.d, z6.d, z28.d
-; CHECK-NEXT:    mov z28.d, z1.d
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z0.d
-; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    eor z5.d, z5.d, z7.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x40000000000000
-; CHECK-NEXT:    and z28.d, z28.d, #0x2000000000000000
-; CHECK-NEXT:    eor z6.d, z6.d, z30.d
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    mov z30.d, z1.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
-; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z0.d
-; CHECK-NEXT:    mul z28.d, p0/m, z28.d, z0.d
-; CHECK-NEXT:    eor z5.d, z5.d, z24.d
-; CHECK-NEXT:    and z7.d, z7.d, #0x80000000000000
-; CHECK-NEXT:    and z30.d, z30.d, #0x4000000000000000
-; CHECK-NEXT:    eor z6.d, z6.d, z25.d
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z0.d
-; CHECK-NEXT:    mul z30.d, p0/m, z30.d, z0.d
-; CHECK-NEXT:    eor z4.d, z5.d, z26.d
-; CHECK-NEXT:    eor z5.d, z6.d, z28.d
-; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    eor z1.d, z2.d, z3.d
-; CHECK-NEXT:    eor z2.d, z4.d, z7.d
-; CHECK-NEXT:    eor z3.d, z5.d, z30.d
-; CHECK-NEXT:    eor z1.d, z1.d, z2.d
-; CHECK-NEXT:    eor z0.d, z3.d, z0.d
-; CHECK-NEXT:    eor z0.d, z1.d, z0.d
-; CHECK-NEXT:    ret
+define <vscale x 2 x i64> @clmul_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
+; CHECK-SVE-LABEL: clmul_nxv2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    ptrue p0.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE-NEXT:    and z4.d, z4.d, #0x4
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x8
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x10
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x20
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x40
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x80
+; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
+; CHECK-SVE-NEXT:    mul z3.d, p0/m, z3.d, z0.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z4.d, p0/m, z4.d, z0.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x100
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x20000
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x400000
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x800000
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x200
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x800
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
+; CHECK-SVE-NEXT:    eor z4.d, z25.d, z26.d
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z0.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x1000
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    and z30.d, z30.d, #0x800000000
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x2000
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x10000
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x400
+; CHECK-SVE-NEXT:    mul z30.d, p0/m, z30.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x40000
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z4.d, z4.d, #0x4000
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z25.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z25.d, z26.d, z27.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z4.d, p0/m, z4.d, z0.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x80000
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z24.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x1000000
+; CHECK-SVE-NEXT:    eor z5.d, z25.d, z5.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x8000
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z4.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x2000000
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    eor z6.d, z28.d, z29.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x100000
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z7.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x4000000
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z27.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x40000000
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z26.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x200000
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x20000000
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z25.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z24.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x4000000000
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x80000000
+; CHECK-SVE-NEXT:    eor z2.d, z5.d, z28.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x2000000000
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x100000000
+; CHECK-SVE-NEXT:    eor z24.d, z27.d, z29.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x8000000000
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x8000000
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z6.d, z24.d, z6.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x200000000
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x10000000
+; CHECK-SVE-NEXT:    eor z25.d, z25.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z5.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x400000000
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x20000000000
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x10000000000
+; CHECK-SVE-NEXT:    eor z4.d, z25.d, z28.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z27.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x40000000000
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z29.d
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x400000000000
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x800000000000
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z24.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x400000000000000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x1000000000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z30.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x200000000000
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x80000000000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    and z30.d, z30.d, #0x800000000000000
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z0.d
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x1000000000000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mul z30.d, p0/m, z30.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z5.d, z24.d
+; CHECK-SVE-NEXT:    eor z5.d, z25.d, z27.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x2000000000000
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z6.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x100000000000000
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x200000000000000
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x100000000000
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x4000000000000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z0.d
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z0.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x8000000000000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z7.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z27.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x10000000000000
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z6.d
+; CHECK-SVE-NEXT:    eor z6.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z29.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    and z24.d, z24.d, #0x20000000000000
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x1000000000000000
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z26.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z28.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z0.d
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z7.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x40000000000000
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x2000000000000000
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z30.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    and z1.d, z1.d, #0x8000000000000000
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z0.d
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z5.d, z24.d
+; CHECK-SVE-NEXT:    and z7.d, z7.d, #0x80000000000000
+; CHECK-SVE-NEXT:    and z30.d, z30.d, #0x4000000000000000
+; CHECK-SVE-NEXT:    eor z6.d, z6.d, z25.d
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z0.d
+; CHECK-SVE-NEXT:    mul z30.d, p0/m, z30.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z26.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z28.d
+; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z2.d, z4.d, z7.d
+; CHECK-SVE-NEXT:    eor z3.d, z5.d, z30.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv2i64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x20
+; CHECK-SVE2-NEXT:    mul z2.d, z0.d, z2.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80000000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40000000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200000000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100000000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800000000000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400000000000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000000000000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000000000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    and z1.d, z1.d, #0x4000000000000000
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x8000000000000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    mul z0.d, z0.d, z1.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv2i64:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.d, z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80000000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40000000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200000000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100000000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800000000000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400000000000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000000000000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000000000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z1.d, z1.d, #0x4000000000000000
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x8000000000000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    mul z0.d, z0.d, z1.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
   ret <vscale x 2 x i64> %a
 }
 
-define <vscale x 16 x i8> @clmul_nxv16i8_sve2(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) #1 {
-; CHECK-LABEL: clmul_nxv16i8_sve2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    pmul z0.b, z0.b, z1.b
-; CHECK-NEXT:    ret
-  %a = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
+; TODO: Fix and re-enable the <vscale x 1 x i128> test below; it is commented out for now.
+; define <vscale x 1 x i128> @clmul_nxv1i128(<vscale x 1 x i128> %x, <vscale x 1 x i128> %y) {
+;   %a = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %x, <vscale x 1 x i128> %y)
+;   ret <vscale x 1 x i128> %a
+; }
+
+define <vscale x 16 x i8> @clmul_nxv16i8_zext(<vscale x 16 x i4> %x, <vscale x 16 x i4> %y) {
+; CHECK-SVE-LABEL: clmul_nxv16i8_zext:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    and z0.b, z0.b, #0xf
+; CHECK-SVE-NEXT:    and z1.b, z1.b, #0x8
+; CHECK-SVE-NEXT:    ptrue p0.b
+; CHECK-SVE-NEXT:    and z2.b, z2.b, #0x2
+; CHECK-SVE-NEXT:    and z3.b, z3.b, #0x1
+; CHECK-SVE-NEXT:    and z4.b, z4.b, #0x4
+; CHECK-SVE-NEXT:    mul z1.b, p0/m, z1.b, z0.b
+; CHECK-SVE-NEXT:    mul z2.b, p0/m, z2.b, z0.b
+; CHECK-SVE-NEXT:    mul z3.b, p0/m, z3.b, z0.b
+; CHECK-SVE-NEXT:    mul z4.b, p0/m, z4.b, z0.b
+; CHECK-SVE-NEXT:    mul z0.b, z0.b, #0
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z1.d, z4.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z0.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv16i8_zext:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    and z0.b, z0.b, #0xf
+; CHECK-SVE2-NEXT:    and z1.b, z1.b, #0xf
+; CHECK-SVE2-NEXT:    pmul z0.b, z0.b, z1.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv16i8_zext:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    and z0.b, z0.b, #0xf
+; CHECK-SVE2-AES-NEXT:    and z1.b, z1.b, #0xf
+; CHECK-SVE2-AES-NEXT:    pmul z0.b, z0.b, z1.b
+; CHECK-SVE2-AES-NEXT:    ret
+  %zextx = zext <vscale x 16 x i4> %x to <vscale x 16 x i8> ; widen %x: upper 4 bits of each i8 lane become zero
+  %zexty = zext <vscale x 16 x i4> %y to <vscale x 16 x i8> ; same widening for %y
+  %a = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %zextx, <vscale x 16 x i8> %zexty) ; carry-less multiply of the widened operands; the SVE2 and SVE2-AES expectations above are two 0xf masks plus one pmul, while the plain SVE expectations are a per-bit and/mul/eor expansion that includes a mul by #0 (presumably from the known-zero upper bits - confirm)
   ret <vscale x 16 x i8> %a
 }
 
-define <vscale x 8 x i16> @clmul_nxv8i16_sve2(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) #1 {
-; CHECK-LABEL: clmul_nxv8i16_sve2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z2.h, z2.h, #0x2
-; CHECK-NEXT:    and z3.h, z3.h, #0x1
-; CHECK-NEXT:    and z4.h, z4.h, #0x8
-; CHECK-NEXT:    and z5.h, z5.h, #0x4
-; CHECK-NEXT:    and z6.h, z6.h, #0x20
-; CHECK-NEXT:    mul z2.h, z0.h, z2.h
-; CHECK-NEXT:    mul z3.h, z0.h, z3.h
-; CHECK-NEXT:    mul z4.h, z0.h, z4.h
-; CHECK-NEXT:    mul z5.h, z0.h, z5.h
-; CHECK-NEXT:    mul z6.h, z0.h, z6.h
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    and z3.h, z3.h, #0x10
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.h, z0.h, z3.h
-; CHECK-NEXT:    and z4.h, z4.h, #0x80
-; CHECK-NEXT:    and z5.h, z5.h, #0x40
-; CHECK-NEXT:    mul z4.h, z0.h, z4.h
-; CHECK-NEXT:    mul z5.h, z0.h, z5.h
-; CHECK-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.h, z3.h, #0x200
-; CHECK-NEXT:    and z6.h, z6.h, #0x100
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.h, z0.h, z3.h
-; CHECK-NEXT:    mul z6.h, z0.h, z6.h
-; CHECK-NEXT:    and z4.h, z4.h, #0x800
-; CHECK-NEXT:    and z5.h, z5.h, #0x400
-; CHECK-NEXT:    mul z4.h, z0.h, z4.h
-; CHECK-NEXT:    mul z5.h, z0.h, z5.h
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.h, z3.h, #0x2000
-; CHECK-NEXT:    and z6.h, z6.h, #0x1000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    and z1.h, z1.h, #0x4000
-; CHECK-NEXT:    mul z3.h, z0.h, z3.h
-; CHECK-NEXT:    mul z4.h, z0.h, z6.h
-; CHECK-NEXT:    and z5.h, z5.h, #0x8000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
-; CHECK-NEXT:    mul z3.h, z0.h, z5.h
-; CHECK-NEXT:    mul z0.h, z0.h, z1.h
-; CHECK-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %a = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
+define <vscale x 8 x i16> @clmul_nxv8i16_zext(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
+; CHECK-SVE-LABEL: clmul_nxv8i16_zext:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-SVE-NEXT:    and z1.h, z1.h, #0x80
+; CHECK-SVE-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE-NEXT:    and z4.h, z4.h, #0x4
+; CHECK-SVE-NEXT:    and z5.h, z5.h, #0x8
+; CHECK-SVE-NEXT:    and z6.h, z6.h, #0x10
+; CHECK-SVE-NEXT:    and z7.h, z7.h, #0x20
+; CHECK-SVE-NEXT:    and z24.h, z24.h, #0x40
+; CHECK-SVE-NEXT:    ptrue p0.h
+; CHECK-SVE-NEXT:    mul z2.h, p0/m, z2.h, z0.h
+; CHECK-SVE-NEXT:    mul z3.h, p0/m, z3.h, z0.h
+; CHECK-SVE-NEXT:    mul z4.h, p0/m, z4.h, z0.h
+; CHECK-SVE-NEXT:    mul z5.h, p0/m, z5.h, z0.h
+; CHECK-SVE-NEXT:    mul z6.h, p0/m, z6.h, z0.h
+; CHECK-SVE-NEXT:    mul z7.h, p0/m, z7.h, z0.h
+; CHECK-SVE-NEXT:    mul z24.h, p0/m, z24.h, z0.h
+; CHECK-SVE-NEXT:    mul z1.h, p0/m, z1.h, z0.h
+; CHECK-SVE-NEXT:    mul z0.h, z0.h, #0
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z5.d, z0.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv8i16_zext:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE2-NEXT:    and z4.h, z4.h, #0x8
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x4
+; CHECK-SVE2-NEXT:    and z6.h, z6.h, #0x20
+; CHECK-SVE2-NEXT:    mul z2.h, z0.h, z2.h
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z4.h, z0.h, z6.h
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0x40
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-NEXT:    and z5.h, z5.h, #0x80
+; CHECK-SVE2-NEXT:    mul z1.h, z0.h, z1.h
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mul z3.h, z0.h, z5.h
+; CHECK-SVE2-NEXT:    mul z0.h, z0.h, #0
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv8i16_zext:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.h, z2.h, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.h, z4.h, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.h, z6.h, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.h, z0.h, z2.h
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z4.h
+; CHECK-SVE2-AES-NEXT:    mul z5.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.h, z3.h, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z4.h, z0.h, z6.h
+; CHECK-SVE2-AES-NEXT:    and z1.h, z1.h, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z3.h
+; CHECK-SVE2-AES-NEXT:    and z5.h, z5.h, #0x80
+; CHECK-SVE2-AES-NEXT:    mul z1.h, z0.h, z1.h
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z3.h, z0.h, z5.h
+; CHECK-SVE2-AES-NEXT:    mul z0.h, z0.h, #0
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
+  %zextx = zext <vscale x 8 x i8> %x to <vscale x 8 x i16> ; widen %x: upper 8 bits of each i16 lane become zero
+  %zexty = zext <vscale x 8 x i8> %y to <vscale x 8 x i16> ; same widening for %y
+  %a = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %zextx, <vscale x 8 x i16> %zexty) ; carry-less multiply of the widened operands; all three expectation sets above show a per-bit and/mul/eor expansion with no pmul, including a mul by #0 (presumably from the known-zero upper 8 bits - confirm)
   ret <vscale x 8 x i16> %a
 }
 
-define <vscale x 4 x i32> @clmul_nxv4i32_sve2(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) #1 {
-; CHECK-LABEL: clmul_nxv4i32_sve2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z2.s, z2.s, #0x2
-; CHECK-NEXT:    and z3.s, z3.s, #0x1
-; CHECK-NEXT:    and z4.s, z4.s, #0x8
-; CHECK-NEXT:    and z5.s, z5.s, #0x4
-; CHECK-NEXT:    and z6.s, z6.s, #0x20
-; CHECK-NEXT:    mul z2.s, z0.s, z2.s
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x10
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x80
-; CHECK-NEXT:    and z5.s, z5.s, #0x40
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x200
-; CHECK-NEXT:    and z6.s, z6.s, #0x100
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x800
-; CHECK-NEXT:    and z5.s, z5.s, #0x400
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x2000
-; CHECK-NEXT:    and z6.s, z6.s, #0x1000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x8000
-; CHECK-NEXT:    and z5.s, z5.s, #0x4000
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x20000
-; CHECK-NEXT:    and z6.s, z6.s, #0x10000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x80000
-; CHECK-NEXT:    and z5.s, z5.s, #0x40000
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x200000
-; CHECK-NEXT:    and z6.s, z6.s, #0x100000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x800000
-; CHECK-NEXT:    and z5.s, z5.s, #0x400000
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x2000000
-; CHECK-NEXT:    and z6.s, z6.s, #0x1000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z6.s, z0.s, z6.s
-; CHECK-NEXT:    and z4.s, z4.s, #0x8000000
-; CHECK-NEXT:    and z5.s, z5.s, #0x4000000
-; CHECK-NEXT:    mul z4.s, z0.s, z4.s
-; CHECK-NEXT:    mul z5.s, z0.s, z5.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.s, z3.s, #0x20000000
-; CHECK-NEXT:    and z6.s, z6.s, #0x10000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    and z1.s, z1.s, #0x40000000
-; CHECK-NEXT:    mul z3.s, z0.s, z3.s
-; CHECK-NEXT:    mul z4.s, z0.s, z6.s
-; CHECK-NEXT:    and z5.s, z5.s, #0x80000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
-; CHECK-NEXT:    mul z3.s, z0.s, z5.s
-; CHECK-NEXT:    mul z0.s, z0.s, z1.s
-; CHECK-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
+define <vscale x 4 x i32> @clmul_nxv4i32_zext(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
+; CHECK-SVE-LABEL: clmul_nxv4i32_zext:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT:    addvl sp, sp, #-1
+; CHECK-SVE-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-SVE-NEXT:    .cfi_offset w29, -16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z7.d, z1.d
+; CHECK-SVE-NEXT:    mov z24.d, z1.d
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    mov z31.d, z1.d
+; CHECK-SVE-NEXT:    mov z8.d, z1.d
+; CHECK-SVE-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-SVE-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE-NEXT:    and z4.s, z4.s, #0x4
+; CHECK-SVE-NEXT:    and z5.s, z5.s, #0x8
+; CHECK-SVE-NEXT:    and z6.s, z6.s, #0x10
+; CHECK-SVE-NEXT:    and z7.s, z7.s, #0x20
+; CHECK-SVE-NEXT:    and z24.s, z24.s, #0x40
+; CHECK-SVE-NEXT:    and z25.s, z25.s, #0x80
+; CHECK-SVE-NEXT:    and z26.s, z26.s, #0x100
+; CHECK-SVE-NEXT:    and z27.s, z27.s, #0x200
+; CHECK-SVE-NEXT:    and z28.s, z28.s, #0x400
+; CHECK-SVE-NEXT:    and z29.s, z29.s, #0x800
+; CHECK-SVE-NEXT:    and z30.s, z30.s, #0x1000
+; CHECK-SVE-NEXT:    and z31.s, z31.s, #0x2000
+; CHECK-SVE-NEXT:    and z8.s, z8.s, #0x4000
+; CHECK-SVE-NEXT:    and z1.s, z1.s, #0x8000
+; CHECK-SVE-NEXT:    ptrue p0.s
+; CHECK-SVE-NEXT:    mul z2.s, p0/m, z2.s, z0.s
+; CHECK-SVE-NEXT:    mul z3.s, p0/m, z3.s, z0.s
+; CHECK-SVE-NEXT:    mul z4.s, p0/m, z4.s, z0.s
+; CHECK-SVE-NEXT:    mul z5.s, p0/m, z5.s, z0.s
+; CHECK-SVE-NEXT:    mul z6.s, p0/m, z6.s, z0.s
+; CHECK-SVE-NEXT:    mul z7.s, p0/m, z7.s, z0.s
+; CHECK-SVE-NEXT:    mul z24.s, p0/m, z24.s, z0.s
+; CHECK-SVE-NEXT:    mul z25.s, p0/m, z25.s, z0.s
+; CHECK-SVE-NEXT:    mul z26.s, p0/m, z26.s, z0.s
+; CHECK-SVE-NEXT:    mul z27.s, p0/m, z27.s, z0.s
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE-NEXT:    mul z28.s, p0/m, z28.s, z0.s
+; CHECK-SVE-NEXT:    mul z29.s, p0/m, z29.s, z0.s
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z5.d
+; CHECK-SVE-NEXT:    mul z30.s, p0/m, z30.s, z0.s
+; CHECK-SVE-NEXT:    mul z31.s, p0/m, z31.s, z0.s
+; CHECK-SVE-NEXT:    eor z4.d, z6.d, z7.d
+; CHECK-SVE-NEXT:    mul z8.s, p0/m, z8.s, z0.s
+; CHECK-SVE-NEXT:    mul z1.s, p0/m, z1.s, z0.s
+; CHECK-SVE-NEXT:    mul z0.s, z0.s, #0
+; CHECK-SVE-NEXT:    eor z5.d, z25.d, z26.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z24.d
+; CHECK-SVE-NEXT:    eor z6.d, z29.d, z30.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z27.d
+; CHECK-SVE-NEXT:    eor z7.d, z0.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z31.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z28.d
+; CHECK-SVE-NEXT:    eor z6.d, z7.d, z0.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z8.d
+; CHECK-SVE-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z3.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z4.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z5.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z2.d
+; CHECK-SVE-NEXT:    eor z0.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z0.d, z6.d
+; CHECK-SVE-NEXT:    addvl sp, sp, #1
+; CHECK-SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv4i32_zext:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x8
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x4
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x20
+; CHECK-SVE2-NEXT:    mul z2.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x80
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x40
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x200
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x100
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z4.s, z4.s, #0x800
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x400
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.s, z3.s, #0x2000
+; CHECK-SVE2-NEXT:    and z6.s, z6.s, #0x1000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0x4000
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    mul z4.s, z0.s, z6.s
+; CHECK-SVE2-NEXT:    and z5.s, z5.s, #0x8000
+; CHECK-SVE2-NEXT:    mul z1.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-NEXT:    mul z3.s, z0.s, z5.s
+; CHECK-SVE2-NEXT:    mul z0.s, z0.s, #0
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv4i32_zext:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.s, z2.s, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.s, z0.s, z2.s
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x80
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x200
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x100
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z6.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z4.s, z4.s, #0x800
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x400
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z4.s
+; CHECK-SVE2-AES-NEXT:    mul z5.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.s, z3.s, #0x2000
+; CHECK-SVE2-AES-NEXT:    and z6.s, z6.s, #0x1000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z1.s, z1.s, #0x4000
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z3.s
+; CHECK-SVE2-AES-NEXT:    mul z4.s, z0.s, z6.s
+; CHECK-SVE2-AES-NEXT:    and z5.s, z5.s, #0x8000
+; CHECK-SVE2-AES-NEXT:    mul z1.s, z0.s, z1.s
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z3.s, z0.s, z5.s
+; CHECK-SVE2-AES-NEXT:    mul z0.s, z0.s, #0
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
+  %zextx = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+  %zexty = zext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+  %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %zextx, <vscale x 4 x i32> %zexty)
   ret <vscale x 4 x i32> %a
 }
 
-define <vscale x 2 x i64> @clmul_nxv2i64_sve2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) #1 {
-; CHECK-LABEL: clmul_nxv2i64_sve2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x2
-; CHECK-NEXT:    and z3.d, z3.d, #0x1
-; CHECK-NEXT:    and z4.d, z4.d, #0x8
-; CHECK-NEXT:    and z5.d, z5.d, #0x4
-; CHECK-NEXT:    and z6.d, z6.d, #0x20
-; CHECK-NEXT:    mul z2.d, z0.d, z2.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x10
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80
-; CHECK-NEXT:    and z5.d, z5.d, #0x40
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200
-; CHECK-NEXT:    and z6.d, z6.d, #0x100
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800
-; CHECK-NEXT:    and z5.d, z5.d, #0x400
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x4000000000000000
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z4.d, z0.d, z6.d
-; CHECK-NEXT:    and z5.d, z5.d, #0x8000000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
-; CHECK-NEXT:    mul z3.d, z0.d, z5.d
-; CHECK-NEXT:    mul z0.d, z0.d, z1.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
-  ret <vscale x 2 x i64> %a
-}
-
-; TODO: lower with .d variant of pmullt/b which requires FEAT_SVE_PMULL128
-define <vscale x 2 x i64> @clmul_nxv2i64_sve2_aes(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) #2 {
-; CHECK-LABEL: clmul_nxv2i64_sve2_aes:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x2
-; CHECK-NEXT:    and z3.d, z3.d, #0x1
-; CHECK-NEXT:    and z4.d, z4.d, #0x8
-; CHECK-NEXT:    and z5.d, z5.d, #0x4
-; CHECK-NEXT:    and z6.d, z6.d, #0x20
-; CHECK-NEXT:    mul z2.d, z0.d, z2.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    eor z2.d, z3.d, z2.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x10
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80
-; CHECK-NEXT:    and z5.d, z5.d, #0x40
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200
-; CHECK-NEXT:    and z6.d, z6.d, #0x100
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800
-; CHECK-NEXT:    and z5.d, z5.d, #0x400
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x8000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x4000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x20000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x10000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x80000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x40000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x200000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x100000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z6.d, z0.d, z6.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x800000000000000
-; CHECK-NEXT:    and z5.d, z5.d, #0x400000000000000
-; CHECK-NEXT:    mul z4.d, z0.d, z4.d
-; CHECK-NEXT:    mul z5.d, z0.d, z5.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    and z3.d, z3.d, #0x2000000000000000
-; CHECK-NEXT:    and z6.d, z6.d, #0x1000000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x4000000000000000
-; CHECK-NEXT:    mul z3.d, z0.d, z3.d
-; CHECK-NEXT:    mul z4.d, z0.d, z6.d
-; CHECK-NEXT:    and z5.d, z5.d, #0x8000000000000000
-; CHECK-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
-; CHECK-NEXT:    mul z3.d, z0.d, z5.d
-; CHECK-NEXT:    mul z0.d, z0.d, z1.d
-; CHECK-NEXT:    eor3 z2.d, z2.d, z0.d, z3.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
-  %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
+define <vscale x 2 x i64> @clmul_nxv2i64_zext(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
+; CHECK-SVE-LABEL: clmul_nxv2i64_zext:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SVE-NEXT:    addvl sp, sp, #-16
+; CHECK-SVE-NEXT:    str z23, [sp] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    addvl sp, sp, #-3
+; CHECK-SVE-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 16 + 152 * VG
+; CHECK-SVE-NEXT:    .cfi_offset w29, -16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
+; CHECK-SVE-NEXT:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
+; CHECK-SVE-NEXT:    mov z6.d, z1.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-SVE-NEXT:    ptrue p0.d
+; CHECK-SVE-NEXT:    mov z30.d, z1.d
+; CHECK-SVE-NEXT:    mov z10.d, z1.d
+; CHECK-SVE-NEXT:    mov z11.d, z1.d
+; CHECK-SVE-NEXT:    mov z12.d, z1.d
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    and z6.d, z6.d, #0x10
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x200
+; CHECK-SVE-NEXT:    mov z3.d, z1.d
+; CHECK-SVE-NEXT:    and z30.d, z30.d, #0x1000
+; CHECK-SVE-NEXT:    and z10.d, z10.d, #0x4000
+; CHECK-SVE-NEXT:    mov z4.d, z1.d
+; CHECK-SVE-NEXT:    and z11.d, z11.d, #0x8000
+; CHECK-SVE-NEXT:    and z12.d, z12.d, #0x10000
+; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE-NEXT:    movprfx z24, z0
+; CHECK-SVE-NEXT:    mul z24.d, p0/m, z24.d, z6.d
+; CHECK-SVE-NEXT:    movprfx z6, z0
+; CHECK-SVE-NEXT:    mul z6.d, p0/m, z6.d, z29.d
+; CHECK-SVE-NEXT:    mov z29.d, z1.d
+; CHECK-SVE-NEXT:    movprfx z9, z0
+; CHECK-SVE-NEXT:    mul z9.d, p0/m, z9.d, z30.d
+; CHECK-SVE-NEXT:    movprfx z30, z0
+; CHECK-SVE-NEXT:    mul z30.d, p0/m, z30.d, z10.d
+; CHECK-SVE-NEXT:    mov z5.d, z1.d
+; CHECK-SVE-NEXT:    movprfx z10, z0
+; CHECK-SVE-NEXT:    mul z10.d, p0/m, z10.d, z12.d
+; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
+; CHECK-SVE-NEXT:    mov z26.d, z1.d
+; CHECK-SVE-NEXT:    and z29.d, z29.d, #0x800
+; CHECK-SVE-NEXT:    mov z27.d, z1.d
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mov z12.d, z1.d
+; CHECK-SVE-NEXT:    mov z15.d, z1.d
+; CHECK-SVE-NEXT:    mov z16.d, z1.d
+; CHECK-SVE-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE-NEXT:    and z4.d, z4.d, #0x4
+; CHECK-SVE-NEXT:    and z5.d, z5.d, #0x8
+; CHECK-SVE-NEXT:    movprfx z8, z0
+; CHECK-SVE-NEXT:    mul z8.d, p0/m, z8.d, z29.d
+; CHECK-SVE-NEXT:    movprfx z29, z0
+; CHECK-SVE-NEXT:    mul z29.d, p0/m, z29.d, z11.d
+; CHECK-SVE-NEXT:    str z2, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    mov z11.d, z1.d
+; CHECK-SVE-NEXT:    and z26.d, z26.d, #0x40
+; CHECK-SVE-NEXT:    and z27.d, z27.d, #0x80
+; CHECK-SVE-NEXT:    mul z3.d, p0/m, z3.d, z0.d
+; CHECK-SVE-NEXT:    mul z4.d, p0/m, z4.d, z0.d
+; CHECK-SVE-NEXT:    movprfx z7, z0
+; CHECK-SVE-NEXT:    mul z7.d, p0/m, z7.d, z5.d
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x100
+; CHECK-SVE-NEXT:    and z12.d, z12.d, #0x40000
+; CHECK-SVE-NEXT:    and z15.d, z15.d, #0x100000
+; CHECK-SVE-NEXT:    movprfx z5, z0
+; CHECK-SVE-NEXT:    mul z5.d, p0/m, z5.d, z26.d
+; CHECK-SVE-NEXT:    movprfx z26, z0
+; CHECK-SVE-NEXT:    mul z26.d, p0/m, z26.d, z27.d
+; CHECK-SVE-NEXT:    and z11.d, z11.d, #0x20000
+; CHECK-SVE-NEXT:    and z16.d, z16.d, #0x200000
+; CHECK-SVE-NEXT:    mov z25.d, z1.d
+; CHECK-SVE-NEXT:    mov z31.d, z1.d
+; CHECK-SVE-NEXT:    movprfx z27, z0
+; CHECK-SVE-NEXT:    mul z27.d, p0/m, z27.d, z28.d
+; CHECK-SVE-NEXT:    movprfx z14, z0
+; CHECK-SVE-NEXT:    mul z14.d, p0/m, z14.d, z12.d
+; CHECK-SVE-NEXT:    movprfx z12, z0
+; CHECK-SVE-NEXT:    mul z12.d, p0/m, z12.d, z15.d
+; CHECK-SVE-NEXT:    movprfx z19, z0
+; CHECK-SVE-NEXT:    mul z19.d, p0/m, z19.d, z11.d
+; CHECK-SVE-NEXT:    str z3, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    mov z28.d, z1.d
+; CHECK-SVE-NEXT:    mov z13.d, z1.d
+; CHECK-SVE-NEXT:    movprfx z11, z0
+; CHECK-SVE-NEXT:    mul z11.d, p0/m, z11.d, z16.d
+; CHECK-SVE-NEXT:    mov z17.d, z1.d
+; CHECK-SVE-NEXT:    mov z18.d, z1.d
+; CHECK-SVE-NEXT:    str z4, [sp] // 16-byte Folded Spill
+; CHECK-SVE-NEXT:    mov z15.d, z1.d
+; CHECK-SVE-NEXT:    mov z16.d, z1.d
+; CHECK-SVE-NEXT:    ldr z4, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    mov z20.d, z1.d
+; CHECK-SVE-NEXT:    mov z21.d, z1.d
+; CHECK-SVE-NEXT:    mov z22.d, z1.d
+; CHECK-SVE-NEXT:    mov z23.d, z1.d
+; CHECK-SVE-NEXT:    mov z2.d, z1.d
+; CHECK-SVE-NEXT:    and z25.d, z25.d, #0x20
+; CHECK-SVE-NEXT:    and z28.d, z28.d, #0x400
+; CHECK-SVE-NEXT:    and z31.d, z31.d, #0x2000
+; CHECK-SVE-NEXT:    and z13.d, z13.d, #0x80000
+; CHECK-SVE-NEXT:    and z17.d, z17.d, #0x400000
+; CHECK-SVE-NEXT:    and z18.d, z18.d, #0x800000
+; CHECK-SVE-NEXT:    and z15.d, z15.d, #0x1000000
+; CHECK-SVE-NEXT:    mul z25.d, p0/m, z25.d, z0.d
+; CHECK-SVE-NEXT:    and z16.d, z16.d, #0x2000000
+; CHECK-SVE-NEXT:    and z20.d, z20.d, #0x4000000
+; CHECK-SVE-NEXT:    mul z28.d, p0/m, z28.d, z0.d
+; CHECK-SVE-NEXT:    mul z31.d, p0/m, z31.d, z0.d
+; CHECK-SVE-NEXT:    and z21.d, z21.d, #0x8000000
+; CHECK-SVE-NEXT:    mul z13.d, p0/m, z13.d, z0.d
+; CHECK-SVE-NEXT:    mul z17.d, p0/m, z17.d, z0.d
+; CHECK-SVE-NEXT:    and z22.d, z22.d, #0x10000000
+; CHECK-SVE-NEXT:    mul z18.d, p0/m, z18.d, z0.d
+; CHECK-SVE-NEXT:    mul z15.d, p0/m, z15.d, z0.d
+; CHECK-SVE-NEXT:    and z23.d, z23.d, #0x20000000
+; CHECK-SVE-NEXT:    and z2.d, z2.d, #0x40000000
+; CHECK-SVE-NEXT:    mul z16.d, p0/m, z16.d, z0.d
+; CHECK-SVE-NEXT:    mul z20.d, p0/m, z20.d, z0.d
+; CHECK-SVE-NEXT:    and z1.d, z1.d, #0x80000000
+; CHECK-SVE-NEXT:    mul z21.d, p0/m, z21.d, z0.d
+; CHECK-SVE-NEXT:    mul z22.d, p0/m, z22.d, z0.d
+; CHECK-SVE-NEXT:    mul z23.d, p0/m, z23.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z4.d, z3.d
+; CHECK-SVE-NEXT:    ldr z4, [sp] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    mul z2.d, p0/m, z2.d, z0.d
+; CHECK-SVE-NEXT:    mul z1.d, p0/m, z1.d, z0.d
+; CHECK-SVE-NEXT:    mul z0.d, z0.d, #0
+; CHECK-SVE-NEXT:    eor z4.d, z4.d, z7.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z25.d
+; CHECK-SVE-NEXT:    eor z24.d, z26.d, z27.d
+; CHECK-SVE-NEXT:    eor z25.d, z8.d, z9.d
+; CHECK-SVE-NEXT:    eor z26.d, z10.d, z19.d
+; CHECK-SVE-NEXT:    eor z27.d, z17.d, z18.d
+; CHECK-SVE-NEXT:    eor z2.d, z23.d, z2.d
+; CHECK-SVE-NEXT:    eor z8.d, z0.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z4.d, z7.d, z5.d
+; CHECK-SVE-NEXT:    eor z5.d, z24.d, z6.d
+; CHECK-SVE-NEXT:    eor z6.d, z25.d, z31.d
+; CHECK-SVE-NEXT:    eor z7.d, z26.d, z14.d
+; CHECK-SVE-NEXT:    eor z24.d, z27.d, z15.d
+; CHECK-SVE-NEXT:    eor z1.d, z2.d, z1.d
+; CHECK-SVE-NEXT:    eor z2.d, z8.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z28.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z30.d
+; CHECK-SVE-NEXT:    eor z6.d, z7.d, z13.d
+; CHECK-SVE-NEXT:    eor z7.d, z24.d, z16.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z29.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z12.d
+; CHECK-SVE-NEXT:    eor z6.d, z7.d, z20.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z11.d
+; CHECK-SVE-NEXT:    eor z5.d, z6.d, z21.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z4.d, z5.d, z22.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z3.d, z3.d, z4.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z2.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z3.d, z1.d
+; CHECK-SVE-NEXT:    eor z3.d, z2.d, z0.d
+; CHECK-SVE-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-SVE-NEXT:    eor z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    eor z0.d, z0.d, z2.d
+; CHECK-SVE-NEXT:    addvl sp, sp, #3
+; CHECK-SVE-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-SVE-NEXT:    addvl sp, sp, #16
+; CHECK-SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-SVE2-LABEL: clmul_nxv2i64_zext:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x20
+; CHECK-SVE2-NEXT:    mul z2.d, z0.d, z2.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x10
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x80000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x40000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x200000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x100000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x800000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x400000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x2000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x1000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z4.d, z4.d, #0x8000000
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x4000000
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-NEXT:    and z3.d, z3.d, #0x20000000
+; CHECK-SVE2-NEXT:    and z6.d, z6.d, #0x10000000
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-NEXT:    and z1.d, z1.d, #0x40000000
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-NEXT:    mul z4.d, z0.d, z6.d
+; CHECK-SVE2-NEXT:    and z5.d, z5.d, #0x80000000
+; CHECK-SVE2-NEXT:    mul z1.d, z0.d, z1.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-NEXT:    mul z3.d, z0.d, z5.d
+; CHECK-SVE2-NEXT:    mul z0.d, z0.d, #0
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-AES-LABEL: clmul_nxv2i64_zext:
+; CHECK-SVE2-AES:       // %bb.0:
+; CHECK-SVE2-AES-NEXT:    mov z2.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z2.d, z2.d, #0x2
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x20
+; CHECK-SVE2-AES-NEXT:    mul z2.d, z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    eor z2.d, z3.d, z2.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x10
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x80000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x40000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x200000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x100000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x800000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x400000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x2000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x1000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z4.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z6.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z4.d, z4.d, #0x8000000
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x4000000
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mul z5.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mov z3.d, z1.d
+; CHECK-SVE2-AES-NEXT:    mov z6.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z3.d, z3.d, #0x20000000
+; CHECK-SVE2-AES-NEXT:    and z6.d, z6.d, #0x10000000
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-SVE2-AES-NEXT:    mov z5.d, z1.d
+; CHECK-SVE2-AES-NEXT:    and z1.d, z1.d, #0x40000000
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z4.d, z0.d, z6.d
+; CHECK-SVE2-AES-NEXT:    and z5.d, z5.d, #0x80000000
+; CHECK-SVE2-AES-NEXT:    mul z1.d, z0.d, z1.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-SVE2-AES-NEXT:    mul z3.d, z0.d, z5.d
+; CHECK-SVE2-AES-NEXT:    mul z0.d, z0.d, #0
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-SVE2-AES-NEXT:    mov z0.d, z2.d
+; CHECK-SVE2-AES-NEXT:    ret
+  %zextx = zext <vscale x 2 x i32> %x to <vscale x 2 x i64>
+  %zexty = zext <vscale x 2 x i32> %y to <vscale x 2 x i64>
+  %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %zextx, <vscale x 2 x i64> %zexty)
   ret <vscale x 2 x i64> %a
 }
 
-attributes #0 = { "target-features"="+sve" }
-attributes #1 = { "target-features"="+sve2" }
-attributes #2 = { "target-features"="+sve2,+sve2-aes" }
+; TODO: fix and re-enable the commented-out nxv1i128 zext test below.
+; define <vscale x 1 x i128> @clmul_nxv1i128_zext(<vscale x 1 x i64> %x, <vscale x 1 x i64> %y) {
+;   %zextx = zext <vscale x 1 x i64> %x to <vscale x 1 x i128>
+;   %zexty = zext <vscale x 1 x i64> %y to <vscale x 1 x i128>
+;   %a = call <vscale x 1 x i128> @llvm.clmul.nxv1i128(<vscale x 1 x i128> %zextx, <vscale x 1 x i128> %zexty)
+;   ret <vscale x 1 x i128> %a
+; }
diff --git a/llvm/test/CodeGen/AArch64/clmul.ll b/llvm/test/CodeGen/AArch64/clmul.ll
index a7260523ae802..75f418f5bfb42 100644
--- a/llvm/test/CodeGen/AArch64/clmul.ll
+++ b/llvm/test/CodeGen/AArch64/clmul.ll
@@ -465,3 +465,759 @@ define i64 @clmul_i64(i64 %x, i64 %y) {
   %a = call i64 @llvm.clmul.i64(i64 %x, i64 %y)
   ret i64 %a
 }
+
+define i16 @clmul_i16_zext(i8 %x, i8 %y) { ; llvm.clmul.i16 applied to two i8 arguments zero-extended to i16
+; CHECK-LABEL: clmul_i16_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    and w9, w1, #0x2
+; CHECK-NEXT:    and w10, w1, #0x1
+; CHECK-NEXT:    mul w9, w8, w9
+; CHECK-NEXT:    and w11, w1, #0x4
+; CHECK-NEXT:    and w12, w1, #0x8
+; CHECK-NEXT:    mul w10, w8, w10
+; CHECK-NEXT:    and w13, w1, #0x10
+; CHECK-NEXT:    and w14, w1, #0x20
+; CHECK-NEXT:    mul w11, w8, w11
+; CHECK-NEXT:    and w15, w1, #0x40
+; CHECK-NEXT:    mul w12, w8, w12
+; CHECK-NEXT:    mul w13, w8, w13
+; CHECK-NEXT:    eor w9, w10, w9
+; CHECK-NEXT:    and w10, w1, #0x80
+; CHECK-NEXT:    mul w14, w8, w14
+; CHECK-NEXT:    mul w15, w8, w15
+; CHECK-NEXT:    eor w11, w11, w12
+; CHECK-NEXT:    mul w8, w8, w10
+; CHECK-NEXT:    eor w9, w9, w11
+; CHECK-NEXT:    eor w12, w13, w14
+; CHECK-NEXT:    eor w10, w12, w15
+; CHECK-NEXT:    eor w9, w9, w10
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %zextx = zext i8 %x to i16 ; widen %x; bits 8-15 of the operand are zero
+  %zexty = zext i8 %y to i16 ; widen %y; bits 8-15 of the operand are zero
+  %a = call i16 @llvm.clmul.i16(i16 %zextx, i16 %zexty) ; expected asm above: 8 and/mul partial products (masks 0x1-0x80 of w1) folded with eor
+  ret i16 %a
+}
+
+define i32 @clmul_i32_zext(i16 %x, i16 %y) { ; llvm.clmul.i32 applied to two i16 arguments zero-extended to i32
+; CHECK-LABEL: clmul_i32_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    and w9, w1, #0x2
+; CHECK-NEXT:    and w10, w1, #0x1
+; CHECK-NEXT:    mul w9, w8, w9
+; CHECK-NEXT:    and w11, w1, #0x4
+; CHECK-NEXT:    and w12, w1, #0x8
+; CHECK-NEXT:    mul w10, w8, w10
+; CHECK-NEXT:    and w13, w1, #0x10
+; CHECK-NEXT:    and w14, w1, #0x20
+; CHECK-NEXT:    mul w11, w8, w11
+; CHECK-NEXT:    and w16, w1, #0x80
+; CHECK-NEXT:    and w17, w1, #0x100
+; CHECK-NEXT:    mul w12, w8, w12
+; CHECK-NEXT:    and w2, w1, #0x800
+; CHECK-NEXT:    and w15, w1, #0x40
+; CHECK-NEXT:    mul w13, w8, w13
+; CHECK-NEXT:    eor w9, w10, w9
+; CHECK-NEXT:    and w10, w1, #0x1000
+; CHECK-NEXT:    mul w14, w8, w14
+; CHECK-NEXT:    and w18, w1, #0x200
+; CHECK-NEXT:    and w0, w1, #0x400
+; CHECK-NEXT:    mul w16, w8, w16
+; CHECK-NEXT:    eor w11, w11, w12
+; CHECK-NEXT:    and w12, w1, #0x2000
+; CHECK-NEXT:    mul w17, w8, w17
+; CHECK-NEXT:    eor w9, w9, w11
+; CHECK-NEXT:    and w11, w1, #0x4000
+; CHECK-NEXT:    mul w2, w8, w2
+; CHECK-NEXT:    eor w13, w13, w14
+; CHECK-NEXT:    and w14, w1, #0x8000
+; CHECK-NEXT:    mul w10, w8, w10
+; CHECK-NEXT:    mul w15, w8, w15
+; CHECK-NEXT:    eor w16, w16, w17
+; CHECK-NEXT:    mul w18, w8, w18
+; CHECK-NEXT:    mul w12, w8, w12
+; CHECK-NEXT:    eor w10, w2, w10
+; CHECK-NEXT:    mul w0, w8, w0
+; CHECK-NEXT:    eor w13, w13, w15
+; CHECK-NEXT:    mul w11, w8, w11
+; CHECK-NEXT:    eor w9, w9, w13
+; CHECK-NEXT:    mul w8, w8, w14
+; CHECK-NEXT:    eor w14, w16, w18
+; CHECK-NEXT:    eor w10, w10, w12
+; CHECK-NEXT:    eor w12, w14, w0
+; CHECK-NEXT:    eor w10, w10, w11
+; CHECK-NEXT:    eor w9, w9, w12
+; CHECK-NEXT:    eor w8, w10, w8
+; CHECK-NEXT:    eor w0, w9, w8
+; CHECK-NEXT:    ret
+  %zextx = zext i16 %x to i32 ; widen %x; bits 16-31 of the operand are zero
+  %zexty = zext i16 %y to i32 ; widen %y; bits 16-31 of the operand are zero
+  %a = call i32 @llvm.clmul.i32(i32 %zextx, i32 %zexty) ; expected asm above: 16 and/mul partial products (masks 0x1-0x8000 of w1) folded with eor
+  ret i32 %a
+}
+
+define i64 @clmul_i64_zext(i32 %x, i32 %y) { ; llvm.clmul.i64 applied to two i32 arguments zero-extended to i64
+; CHECK-LABEL: clmul_i64_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    and x9, x8, #0x2
+; CHECK-NEXT:    and x10, x8, #0x1
+; CHECK-NEXT:    and x11, x8, #0x4
+; CHECK-NEXT:    umull x9, w9, w0
+; CHECK-NEXT:    and x12, x8, #0x8
+; CHECK-NEXT:    and x13, x8, #0x10
+; CHECK-NEXT:    umull x10, w10, w0
+; CHECK-NEXT:    and x14, x8, #0x20
+; CHECK-NEXT:    and x15, x8, #0x40
+; CHECK-NEXT:    umull x11, w11, w0
+; CHECK-NEXT:    and x2, x8, #0x800
+; CHECK-NEXT:    and x16, x8, #0x80
+; CHECK-NEXT:    umull x12, w12, w0
+; CHECK-NEXT:    and x17, x8, #0x100
+; CHECK-NEXT:    and x18, x8, #0x200
+; CHECK-NEXT:    umull x13, w13, w0
+; CHECK-NEXT:    eor x9, x10, x9
+; CHECK-NEXT:    and x10, x8, #0x1000
+; CHECK-NEXT:    umull x14, w14, w0
+; CHECK-NEXT:    and x1, x8, #0x400
+; CHECK-NEXT:    umull x15, w15, w0
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    and x12, x8, #0x2000
+; CHECK-NEXT:    umull x2, w2, w0
+; CHECK-NEXT:    eor x9, x9, x11
+; CHECK-NEXT:    and x11, x8, #0x4000
+; CHECK-NEXT:    umull x10, w10, w0
+; CHECK-NEXT:    eor x13, x13, x14
+; CHECK-NEXT:    and x14, x8, #0x8000
+; CHECK-NEXT:    umull x16, w16, w0
+; CHECK-NEXT:    eor x13, x13, x15
+; CHECK-NEXT:    and x15, x8, #0x10000
+; CHECK-NEXT:    umull x17, w17, w0
+; CHECK-NEXT:    eor x9, x9, x13
+; CHECK-NEXT:    and x13, x8, #0x20000
+; CHECK-NEXT:    umull x12, w12, w0
+; CHECK-NEXT:    eor x10, x2, x10
+; CHECK-NEXT:    and x2, x8, #0x400000
+; CHECK-NEXT:    umull x18, w18, w0
+; CHECK-NEXT:    umull x11, w11, w0
+; CHECK-NEXT:    eor x16, x16, x17
+; CHECK-NEXT:    and x17, x8, #0x40000
+; CHECK-NEXT:    umull x14, w14, w0
+; CHECK-NEXT:    eor x10, x10, x12
+; CHECK-NEXT:    and x12, x8, #0x800000
+; CHECK-NEXT:    umull x1, w1, w0
+; CHECK-NEXT:    eor x16, x16, x18
+; CHECK-NEXT:    and x18, x8, #0x80000
+; CHECK-NEXT:    umull x15, w15, w0
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    and x11, x8, #0x1000000
+; CHECK-NEXT:    umull x13, w13, w0
+; CHECK-NEXT:    eor x10, x10, x14
+; CHECK-NEXT:    and x14, x8, #0x2000000
+; CHECK-NEXT:    umull x17, w17, w0
+; CHECK-NEXT:    eor x16, x16, x1
+; CHECK-NEXT:    and x1, x8, #0x100000
+; CHECK-NEXT:    umull x2, w2, w0
+; CHECK-NEXT:    eor x9, x9, x16
+; CHECK-NEXT:    and x16, x8, #0x200000
+; CHECK-NEXT:    umull x12, w12, w0
+; CHECK-NEXT:    eor x13, x15, x13
+; CHECK-NEXT:    and x15, x8, #0x4000000
+; CHECK-NEXT:    umull x18, w18, w0
+; CHECK-NEXT:    eor x13, x13, x17
+; CHECK-NEXT:    and x17, x8, #0x8000000
+; CHECK-NEXT:    umull x11, w11, w0
+; CHECK-NEXT:    eor x9, x9, x10
+; CHECK-NEXT:    umull x14, w14, w0
+; CHECK-NEXT:    eor x12, x2, x12
+; CHECK-NEXT:    and x2, x8, #0x20000000
+; CHECK-NEXT:    umull x1, w1, w0
+; CHECK-NEXT:    eor x13, x13, x18
+; CHECK-NEXT:    and x18, x8, #0x10000000
+; CHECK-NEXT:    umull x15, w15, w0
+; CHECK-NEXT:    eor x11, x12, x11
+; CHECK-NEXT:    and x12, x8, #0x40000000
+; CHECK-NEXT:    umull x16, w16, w0
+; CHECK-NEXT:    eor x11, x11, x14
+; CHECK-NEXT:    and x8, x8, #0x80000000
+; CHECK-NEXT:    umull x17, w17, w0
+; CHECK-NEXT:    eor x13, x13, x1
+; CHECK-NEXT:    umull x18, w18, w0
+; CHECK-NEXT:    eor x11, x11, x15
+; CHECK-NEXT:    umull x2, w2, w0
+; CHECK-NEXT:    eor x10, x13, x16
+; CHECK-NEXT:    umull x12, w12, w0
+; CHECK-NEXT:    eor x11, x11, x17
+; CHECK-NEXT:    eor x9, x9, x10
+; CHECK-NEXT:    umull x8, w8, w0
+; CHECK-NEXT:    eor x10, x11, x18
+; CHECK-NEXT:    eor x9, x9, x10
+; CHECK-NEXT:    eor x11, x2, x12
+; CHECK-NEXT:    eor x8, x11, x8
+; CHECK-NEXT:    eor x0, x9, x8
+; CHECK-NEXT:    ret
+  %zextx = zext i32 %x to i64 ; widen %x; bits 32-63 of the operand are zero
+  %zexty = zext i32 %y to i64 ; widen %y; bits 32-63 of the operand are zero
+  %a = call i64 @llvm.clmul.i64(i64 %zextx, i64 %zexty) ; expected asm above: 32 and/umull partial products (masks 0x1-0x80000000) folded with eor
+  ret i64 %a
+}
+
+define i128 @clmul_i128_zext(i64 %x, i64 %y) {
+; CHECK-LABEL: clmul_i128_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #624
+; CHECK-NEXT:    .cfi_def_cfa_offset 720
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w23, -40
+; CHECK-NEXT:    .cfi_offset w24, -48
+; CHECK-NEXT:    .cfi_offset w25, -56
+; CHECK-NEXT:    .cfi_offset w26, -64
+; CHECK-NEXT:    .cfi_offset w27, -72
+; CHECK-NEXT:    .cfi_offset w28, -80
+; CHECK-NEXT:    .cfi_offset w30, -88
+; CHECK-NEXT:    .cfi_offset w29, -96
+; CHECK-NEXT:    and x8, x1, #0x2
+; CHECK-NEXT:    mul x11, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x1
+; CHECK-NEXT:    mul x12, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x4
+; CHECK-NEXT:    mul x13, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x8
+; CHECK-NEXT:    mul x14, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x10
+; CHECK-NEXT:    mul x15, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x20
+; CHECK-NEXT:    mul x16, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x40
+; CHECK-NEXT:    mul x17, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x80
+; CHECK-NEXT:    mul x18, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x100
+; CHECK-NEXT:    mul x3, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x200
+; CHECK-NEXT:    mul x2, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x400
+; CHECK-NEXT:    mul x4, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x800
+; CHECK-NEXT:    mul x5, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x1000
+; CHECK-NEXT:    mul x20, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x2000
+; CHECK-NEXT:    mul x6, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x4000
+; CHECK-NEXT:    mul x7, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x8000
+; CHECK-NEXT:    mul x19, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x10000
+; CHECK-NEXT:    mul x21, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x20000
+; CHECK-NEXT:    mul x22, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x40000
+; CHECK-NEXT:    mul x23, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x80000
+; CHECK-NEXT:    mul x24, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x100000
+; CHECK-NEXT:    mul x25, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x200000
+; CHECK-NEXT:    mul x26, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x400000
+; CHECK-NEXT:    mul x27, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x800000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #592] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x1000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #584] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x2000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #616] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x4000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #576] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x8000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #608] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x10000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #600] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x20000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #568] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x40000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #512] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x80000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #536] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x100000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #528] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x200000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #560] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x400000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #520] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x800000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #552] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x1000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #544] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x2000000000
+; CHECK-NEXT:    mul x9, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x4000000000
+; CHECK-NEXT:    mul x10, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x8000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #464] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x10000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #456] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x20000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x8, x9, [sp, #488] // 16-byte Folded Spill
+; CHECK-NEXT:    and x8, x1, #0x40000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x10, x8, [sp, #440] // 16-byte Folded Spill
+; CHECK-NEXT:    and x8, x1, #0x80000000000
+; CHECK-NEXT:    mul x9, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x100000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x8, x9, [sp, #472] // 16-byte Folded Spill
+; CHECK-NEXT:    and x8, x1, #0x200000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #504] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x400000000000
+; CHECK-NEXT:    mul x9, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x800000000000
+; CHECK-NEXT:    mul x10, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x1000000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #416] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x2000000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    str x8, [sp, #408] // 8-byte Spill
+; CHECK-NEXT:    and x8, x1, #0x4000000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x8, x9, [sp, #424] // 16-byte Folded Spill
+; CHECK-NEXT:    and x8, x1, #0x8000000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x10, x8, [sp, #392] // 16-byte Folded Spill
+; CHECK-NEXT:    and x8, x1, #0x100000000000000
+; CHECK-NEXT:    mul x9, x0, x8
+; CHECK-NEXT:    and x8, x1, #0x200000000000000
+; CHECK-NEXT:    mul x8, x0, x8
+; CHECK-NEXT:    stp x8, x9, [sp, #376] // 16-byte Folded Spill
+; CHECK-NEXT:    and x9, x1, #0x400000000000000
+; CHECK-NEXT:    rbit x8, x1
+; CHECK-NEXT:    mul x9, x0, x9
+; CHECK-NEXT:    and x10, x8, #0x2
+; CHECK-NEXT:    str x9, [sp, #368] // 8-byte Spill
+; CHECK-NEXT:    rbit x9, x0
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #360] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x1
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #352] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x4
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #344] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x8
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #336] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x10
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #328] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x20
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #320] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x40
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #312] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x80
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #304] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x100
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #296] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x200
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #288] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x400
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #280] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x800
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #272] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x1000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #256] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x2000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #248] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x4000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #264] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x8000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #240] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x10000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #232] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x20000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #200] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x40000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #224] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x80000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #192] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x100000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #216] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x200000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #208] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x400000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #184] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x800000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #136] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x1000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #168] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x2000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #160] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x4000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #176] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x8000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #152] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x10000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #144] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x20000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #128] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x40000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #120] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x80000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #112] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x100000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #104] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x200000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #96] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x400000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #88] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x800000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #80] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x1000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #72] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x2000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #64] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x4000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #56] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x8000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #48] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x10000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #40] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x20000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #32] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x40000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #24] // 8-byte Spill
+; CHECK-NEXT:    and x10, x8, #0x80000000000
+; CHECK-NEXT:    mul x10, x9, x10
+; CHECK-NEXT:    str x10, [sp, #16] // 8-byte Spill
+; CHECK-NEXT:    eor x10, x12, x11
+; CHECK-NEXT:    and x12, x8, #0x100000000000
+; CHECK-NEXT:    mul x12, x9, x12
+; CHECK-NEXT:    eor x11, x13, x14
+; CHECK-NEXT:    and x13, x8, #0x200000000000
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    eor x11, x15, x16
+; CHECK-NEXT:    ldr x14, [sp, #608] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x17
+; CHECK-NEXT:    mul x30, x9, x13
+; CHECK-NEXT:    and x13, x8, #0x400000000000
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    mul x29, x9, x13
+; CHECK-NEXT:    and x13, x8, #0x800000000000
+; CHECK-NEXT:    str x12, [sp, #8] // 8-byte Spill
+; CHECK-NEXT:    eor x12, x18, x3
+; CHECK-NEXT:    eor x11, x12, x2
+; CHECK-NEXT:    eor x12, x5, x20
+; CHECK-NEXT:    mul x28, x9, x13
+; CHECK-NEXT:    eor x11, x11, x4
+; CHECK-NEXT:    ldr x13, [sp, #592] // 8-byte Reload
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    eor x11, x12, x6
+; CHECK-NEXT:    eor x12, x21, x22
+; CHECK-NEXT:    eor x11, x11, x7
+; CHECK-NEXT:    eor x12, x12, x23
+; CHECK-NEXT:    eor x13, x27, x13
+; CHECK-NEXT:    eor x11, x11, x19
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    eor x11, x12, x24
+; CHECK-NEXT:    and x12, x8, #0x1000000000000
+; CHECK-NEXT:    eor x11, x11, x25
+; CHECK-NEXT:    mul x27, x9, x12
+; CHECK-NEXT:    ldr x12, [sp, #584] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x26
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    ldr x11, [sp, #616] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x13, x12
+; CHECK-NEXT:    and x13, x8, #0x2000000000000
+; CHECK-NEXT:    eor x11, x12, x11
+; CHECK-NEXT:    ldr x12, [sp, #576] // 8-byte Reload
+; CHECK-NEXT:    mul x25, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #512] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    ldr x12, [sp, #568] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x14
+; CHECK-NEXT:    ldr x14, [sp, #536] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x13
+; CHECK-NEXT:    and x13, x8, #0x4000000000000
+; CHECK-NEXT:    mul x24, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #600] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x14
+; CHECK-NEXT:    ldr x14, [sp, #552] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x13
+; CHECK-NEXT:    ldr x13, [sp, #528] // 8-byte Reload
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    ldr x11, [sp, #560] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x13
+; CHECK-NEXT:    and x13, x8, #0x8000000000000
+; CHECK-NEXT:    eor x11, x12, x11
+; CHECK-NEXT:    ldr x12, [sp, #520] // 8-byte Reload
+; CHECK-NEXT:    mul x23, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #440] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    ldr x12, [sp, #496] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x14
+; CHECK-NEXT:    ldr x14, [sp, #464] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x13
+; CHECK-NEXT:    and x13, x8, #0x10000000000000
+; CHECK-NEXT:    mul x21, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #544] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x14
+; CHECK-NEXT:    ldr x14, [sp, #256] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x13
+; CHECK-NEXT:    ldr x13, [sp, #456] // 8-byte Reload
+; CHECK-NEXT:    eor x26, x10, x11
+; CHECK-NEXT:    ldr x10, [sp, #488] // 8-byte Reload
+; CHECK-NEXT:    ldr x11, [sp, #448] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x13
+; CHECK-NEXT:    and x13, x8, #0x20000000000000
+; CHECK-NEXT:    eor x10, x12, x10
+; CHECK-NEXT:    mul x20, x9, x13
+; CHECK-NEXT:    ldr x12, [sp, #392] // 8-byte Reload
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    ldr x11, [sp, #432] // 8-byte Reload
+; CHECK-NEXT:    ldr x13, [sp, #480] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    and x12, x8, #0x40000000000000
+; CHECK-NEXT:    eor x10, x10, x13
+; CHECK-NEXT:    ldr x13, [sp, #416] // 8-byte Reload
+; CHECK-NEXT:    mul x7, x9, x12
+; CHECK-NEXT:    ldr x12, [sp, #472] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x13
+; CHECK-NEXT:    ldr x13, [sp, #504] // 8-byte Reload
+; CHECK-NEXT:    eor x10, x10, x12
+; CHECK-NEXT:    ldr x12, [sp, #408] // 8-byte Reload
+; CHECK-NEXT:    eor x22, x10, x13
+; CHECK-NEXT:    ldr x10, [sp, #424] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    and x12, x8, #0x80000000000000
+; CHECK-NEXT:    eor x10, x11, x10
+; CHECK-NEXT:    ldr x11, [sp, #400] // 8-byte Reload
+; CHECK-NEXT:    mul x5, x9, x12
+; CHECK-NEXT:    ldr x12, [sp, #368] // 8-byte Reload
+; CHECK-NEXT:    eor x19, x10, x11
+; CHECK-NEXT:    ldp x11, x10, [sp, #376] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    and x11, x8, #0x100000000000000
+; CHECK-NEXT:    eor x6, x10, x12
+; CHECK-NEXT:    ldp x12, x10, [sp, #352] // 16-byte Folded Reload
+; CHECK-NEXT:    mul x4, x9, x11
+; CHECK-NEXT:    eor x10, x12, x10
+; CHECK-NEXT:    ldp x12, x11, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    ldp x13, x12, [sp, #320] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    ldr x11, [sp, #312] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x13
+; CHECK-NEXT:    and x13, x8, #0x200000000000000
+; CHECK-NEXT:    eor x11, x12, x11
+; CHECK-NEXT:    mul x3, x9, x13
+; CHECK-NEXT:    and x13, x8, #0x400000000000000
+; CHECK-NEXT:    eor x12, x10, x11
+; CHECK-NEXT:    ldp x11, x10, [sp, #296] // 16-byte Folded Reload
+; CHECK-NEXT:    mul x2, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #280] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x10, x11
+; CHECK-NEXT:    ldr x10, [sp, #288] // 8-byte Reload
+; CHECK-NEXT:    eor x3, x4, x3
+; CHECK-NEXT:    and x4, x1, #0x2000000000000000
+; CHECK-NEXT:    eor x11, x11, x10
+; CHECK-NEXT:    ldr x10, [sp, #272] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x13
+; CHECK-NEXT:    ldr x13, [sp, #248] // 8-byte Reload
+; CHECK-NEXT:    eor x2, x3, x2
+; CHECK-NEXT:    eor x10, x10, x14
+; CHECK-NEXT:    eor x11, x12, x11
+; CHECK-NEXT:    ldr x12, [sp, #264] // 8-byte Reload
+; CHECK-NEXT:    eor x10, x10, x13
+; CHECK-NEXT:    and x13, x8, #0x800000000000000
+; CHECK-NEXT:    mul x3, x0, x4
+; CHECK-NEXT:    eor x12, x10, x12
+; CHECK-NEXT:    ldr x10, [sp, #240] // 8-byte Reload
+; CHECK-NEXT:    mul x18, x9, x13
+; CHECK-NEXT:    ldr x13, [sp, #200] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x12, x10
+; CHECK-NEXT:    ldr x10, [sp, #232] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    ldp x14, x12, [sp, #216] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x13, x10, x13
+; CHECK-NEXT:    and x10, x8, #0x1000000000000000
+; CHECK-NEXT:    mul x17, x9, x10
+; CHECK-NEXT:    ldr x10, [sp, #192] // 8-byte Reload
+; CHECK-NEXT:    eor x12, x13, x12
+; CHECK-NEXT:    ldr x13, [sp, #136] // 8-byte Reload
+; CHECK-NEXT:    eor x18, x2, x18
+; CHECK-NEXT:    eor x12, x12, x10
+; CHECK-NEXT:    ldr x10, [sp, #184] // 8-byte Reload
+; CHECK-NEXT:    and x2, x1, #0x4000000000000000
+; CHECK-NEXT:    eor x12, x12, x14
+; CHECK-NEXT:    ldr x14, [sp, #168] // 8-byte Reload
+; CHECK-NEXT:    eor x13, x10, x13
+; CHECK-NEXT:    and x10, x8, #0x2000000000000000
+; CHECK-NEXT:    and x8, x8, #0x4000000000000000
+; CHECK-NEXT:    mul x16, x9, x10
+; CHECK-NEXT:    ldr x10, [sp, #208] // 8-byte Reload
+; CHECK-NEXT:    eor x13, x13, x14
+; CHECK-NEXT:    eor x17, x18, x17
+; CHECK-NEXT:    eor x12, x12, x10
+; CHECK-NEXT:    ldr x10, [sp, #160] // 8-byte Reload
+; CHECK-NEXT:    mul x15, x9, x8
+; CHECK-NEXT:    ldr x8, [sp, #152] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    eor x13, x13, x10
+; CHECK-NEXT:    ldr x10, [sp, #176] // 8-byte Reload
+; CHECK-NEXT:    mul x18, x0, x2
+; CHECK-NEXT:    eor x16, x17, x16
+; CHECK-NEXT:    and x17, x1, #0x8000000000000000
+; CHECK-NEXT:    eor x12, x13, x10
+; CHECK-NEXT:    and x13, x1, #0x10000000000000
+; CHECK-NEXT:    eor x9, x12, x8
+; CHECK-NEXT:    ldp x10, x8, [sp, #120] // 16-byte Folded Reload
+; CHECK-NEXT:    mul x14, x0, x13
+; CHECK-NEXT:    eor x12, x8, x10
+; CHECK-NEXT:    ldr x8, [sp, #144] // 8-byte Reload
+; CHECK-NEXT:    eor x9, x9, x8
+; CHECK-NEXT:    ldp x8, x10, [sp, #104] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x9, x11, x9
+; CHECK-NEXT:    eor x14, x19, x14
+; CHECK-NEXT:    eor x12, x12, x10
+; CHECK-NEXT:    eor x11, x12, x8
+; CHECK-NEXT:    ldr x8, [sp, #96] // 8-byte Reload
+; CHECK-NEXT:    and x12, x1, #0x20000000000000
+; CHECK-NEXT:    mul x13, x0, x12
+; CHECK-NEXT:    eor x11, x11, x8
+; CHECK-NEXT:    ldp x10, x8, [sp, #56] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x8, x8, x10
+; CHECK-NEXT:    ldp x12, x10, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    eor x11, x11, x10
+; CHECK-NEXT:    ldr x10, [sp, #48] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x12
+; CHECK-NEXT:    ldr x12, [sp, #40] // 8-byte Reload
+; CHECK-NEXT:    eor x8, x8, x10
+; CHECK-NEXT:    and x10, x1, #0x40000000000000
+; CHECK-NEXT:    eor x8, x8, x12
+; CHECK-NEXT:    mul x12, x0, x10
+; CHECK-NEXT:    ldr x10, [sp, #72] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x10
+; CHECK-NEXT:    ldr x10, [sp, #32] // 8-byte Reload
+; CHECK-NEXT:    eor x9, x9, x11
+; CHECK-NEXT:    ldr x11, [sp, #24] // 8-byte Reload
+; CHECK-NEXT:    eor x8, x8, x10
+; CHECK-NEXT:    and x10, x1, #0x80000000000000
+; CHECK-NEXT:    eor x8, x8, x11
+; CHECK-NEXT:    ldr x11, [sp, #16] // 8-byte Reload
+; CHECK-NEXT:    mul x10, x0, x10
+; CHECK-NEXT:    eor x8, x8, x11
+; CHECK-NEXT:    eor x11, x29, x28
+; CHECK-NEXT:    ldr x29, [sp, #8] // 8-byte Reload
+; CHECK-NEXT:    eor x11, x11, x27
+; CHECK-NEXT:    and x28, x1, #0x800000000000000
+; CHECK-NEXT:    eor x8, x8, x29
+; CHECK-NEXT:    eor x11, x11, x25
+; CHECK-NEXT:    mul x27, x0, x28
+; CHECK-NEXT:    eor x8, x8, x30
+; CHECK-NEXT:    and x25, x1, #0x1000000000000000
+; CHECK-NEXT:    eor x8, x9, x8
+; CHECK-NEXT:    eor x9, x11, x24
+; CHECK-NEXT:    mul x11, x0, x25
+; CHECK-NEXT:    eor x9, x9, x23
+; CHECK-NEXT:    eor x9, x9, x21
+; CHECK-NEXT:    eor x9, x9, x20
+; CHECK-NEXT:    eor x9, x9, x7
+; CHECK-NEXT:    eor x9, x9, x5
+; CHECK-NEXT:    eor x8, x8, x9
+; CHECK-NEXT:    eor x9, x16, x15
+; CHECK-NEXT:    mul x15, x0, x17
+; CHECK-NEXT:    eor x16, x6, x27
+; CHECK-NEXT:    eor x8, x8, x9
+; CHECK-NEXT:    eor x9, x14, x13
+; CHECK-NEXT:    eor x11, x16, x11
+; CHECK-NEXT:    rbit x8, x8
+; CHECK-NEXT:    eor x9, x9, x12
+; CHECK-NEXT:    eor x11, x11, x3
+; CHECK-NEXT:    eor x13, x26, x22
+; CHECK-NEXT:    eor x9, x9, x10
+; CHECK-NEXT:    eor x10, x11, x18
+; CHECK-NEXT:    lsr x1, x8, #1
+; CHECK-NEXT:    eor x8, x13, x9
+; CHECK-NEXT:    eor x9, x10, x15
+; CHECK-NEXT:    eor x0, x8, x9
+; CHECK-NEXT:    add sp, sp, #624
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %zextx = zext i64 %x to i128
+  %zexty = zext i64 %y to i128
+  %a = call i128 @llvm.clmul.i128(i128 %zextx, i128 %zexty)
+  ret i128 %a
+}



More information about the llvm-commits mailing list