[llvm] Add clmul zext AArch64 lowering tests (PR #179641)
Matthew Devereau via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 03:08:07 PST 2026
https://github.com/MDevereau created https://github.com/llvm/llvm-project/pull/179641
None
>From d4302bb47794182a8522b223387c88c99bed71ff Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Wed, 4 Feb 2026 11:05:48 +0000
Subject: [PATCH] Add clmul zext AArch64 lowering tests
---
llvm/test/CodeGen/AArch64/clmul-fixed.ll | 795 ++++++++++++++++++-
llvm/test/CodeGen/AArch64/clmul-scalable.ll | 813 ++++++++++++++++++++
llvm/test/CodeGen/AArch64/clmul.ll | 220 ++++++
3 files changed, 1827 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index ff5eeb6cdb43d..23a1614f194a8 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -455,4 +455,797 @@ define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
; ret <1 x i64> %a
; }
-attributes #0 = { "target-features"="+aes" }
\ No newline at end of file
+define <16 x i8> @clmul_v16i8_neon_zext(<16 x i4> %x, <16 x i4> %y) {
+; CHECK-LABEL: clmul_v16i8_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.16b, #15
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: pmul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %zextx = zext <16 x i4> %x to <16 x i8>
+ %zexty = zext <16 x i4> %y to <16 x i8>
+ %a = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %zextx, <16 x i8> %zexty)
+ ret <16 x i8> %a
+}
+
+define <8 x i8> @clmul_v8i8_neon_zext(<8 x i4> %x, <8 x i4> %y) {
+; CHECK-LABEL: clmul_v8i8_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.8b, #15
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+ %zextx = zext <8 x i4> %x to <8 x i8>
+ %zexty = zext <8 x i4> %y to <8 x i8>
+ %a = call <8 x i8> @llvm.clmul.v8i8(<8 x i8> %zextx, <8 x i8> %zexty)
+ ret <8 x i8> %a
+}
+
+define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
+; CHECK-LABEL: clmul_v8i16_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.8h, #2
+; CHECK-NEXT: movi v3.8h, #1
+; CHECK-NEXT: movi v4.8h, #4
+; CHECK-NEXT: movi v5.8h, #8
+; CHECK-NEXT: movi v6.8h, #16
+; CHECK-NEXT: movi v7.8h, #32
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: movi v16.8h, #64
+; CHECK-NEXT: movi v17.8h, #128
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT: xtn v2.8b, v2.8h
+; CHECK-NEXT: xtn v3.8b, v3.8h
+; CHECK-NEXT: xtn v4.8b, v4.8h
+; CHECK-NEXT: xtn v5.8b, v5.8h
+; CHECK-NEXT: xtn v6.8b, v6.8h
+; CHECK-NEXT: xtn v7.8b, v7.8h
+; CHECK-NEXT: xtn v16.8b, v16.8h
+; CHECK-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEXT: umull v2.8h, v0.8b, v2.8b
+; CHECK-NEXT: umull v3.8h, v0.8b, v3.8b
+; CHECK-NEXT: umull v4.8h, v0.8b, v4.8b
+; CHECK-NEXT: umull v5.8h, v0.8b, v5.8b
+; CHECK-NEXT: umull v6.8h, v0.8b, v6.8b
+; CHECK-NEXT: umull v7.8h, v0.8b, v7.8b
+; CHECK-NEXT: umull v16.8h, v0.8b, v16.8b
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: eor v4.16b, v6.16b, v7.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v16.16b
+; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %zextx = zext <8 x i8> %x to <8 x i16>
+ %zexty = zext <8 x i8> %y to <8 x i16>
+ %a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %zextx, <8 x i16> %zexty)
+ ret <8 x i16> %a
+}
+
+define <4 x i16> @clmul_v4i16_neon_zext(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: clmul_v4i16_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4h, #2
+; CHECK-NEXT: movi v3.4h, #1
+; CHECK-NEXT: movi v4.4h, #4
+; CHECK-NEXT: movi v5.4h, #8
+; CHECK-NEXT: movi v6.4h, #16
+; CHECK-NEXT: movi v7.4h, #32
+; CHECK-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-NEXT: movi v16.4h, #64
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-NEXT: movi v17.4h, #128
+; CHECK-NEXT: and v2.8b, v1.8b, v2.8b
+; CHECK-NEXT: and v3.8b, v1.8b, v3.8b
+; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEXT: and v5.8b, v1.8b, v5.8b
+; CHECK-NEXT: and v6.8b, v1.8b, v6.8b
+; CHECK-NEXT: and v7.8b, v1.8b, v7.8b
+; CHECK-NEXT: and v16.8b, v1.8b, v16.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v17.8b
+; CHECK-NEXT: mul v2.4h, v0.4h, v2.4h
+; CHECK-NEXT: mul v3.4h, v0.4h, v3.4h
+; CHECK-NEXT: mul v4.4h, v0.4h, v4.4h
+; CHECK-NEXT: mul v5.4h, v0.4h, v5.4h
+; CHECK-NEXT: mul v6.4h, v0.4h, v6.4h
+; CHECK-NEXT: mul v7.4h, v0.4h, v7.4h
+; CHECK-NEXT: mul v16.4h, v0.4h, v16.4h
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: eor v3.8b, v4.8b, v5.8b
+; CHECK-NEXT: eor v4.8b, v6.8b, v7.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: eor v3.8b, v4.8b, v16.8b
+; CHECK-NEXT: eor v1.8b, v2.8b, v3.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ret
+ %zextx = zext <4 x i8> %x to <4 x i16>
+ %zexty = zext <4 x i8> %y to <4 x i16>
+ %a = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %zextx, <4 x i16> %zexty)
+ ret <4 x i16> %a
+}
+
+define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
+; CHECK-LABEL: clmul_v4i32_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #2
+; CHECK-NEXT: movi v3.4s, #1
+; CHECK-NEXT: movi v4.4s, #4
+; CHECK-NEXT: movi v5.4s, #8
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: movi v6.4s, #16
+; CHECK-NEXT: movi v7.4s, #32
+; CHECK-NEXT: movi v16.4s, #128
+; CHECK-NEXT: movi v17.4s, #1, lsl #8
+; CHECK-NEXT: movi v18.4s, #8, lsl #8
+; CHECK-NEXT: movi v19.4s, #16, lsl #8
+; CHECK-NEXT: movi v20.4s, #64
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: movi v21.4s, #2, lsl #8
+; CHECK-NEXT: movi v22.4s, #32, lsl #8
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: xtn v2.4h, v2.4s
+; CHECK-NEXT: xtn v3.4h, v3.4s
+; CHECK-NEXT: xtn v4.4h, v4.4s
+; CHECK-NEXT: xtn v5.4h, v5.4s
+; CHECK-NEXT: movi v23.4s, #4, lsl #8
+; CHECK-NEXT: movi v24.4s, #64, lsl #8
+; CHECK-NEXT: xtn v6.4h, v6.4s
+; CHECK-NEXT: xtn v7.4h, v7.4s
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: xtn v16.4h, v16.4s
+; CHECK-NEXT: xtn v17.4h, v17.4s
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: xtn v18.4h, v18.4s
+; CHECK-NEXT: xtn v19.4h, v19.4s
+; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
+; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
+; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h
+; CHECK-NEXT: umull v5.4s, v0.4h, v5.4h
+; CHECK-NEXT: movi v25.4s, #128, lsl #8
+; CHECK-NEXT: xtn v20.4h, v20.4s
+; CHECK-NEXT: xtn v21.4h, v21.4s
+; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT: xtn v22.4h, v22.4s
+; CHECK-NEXT: and v24.16b, v1.16b, v24.16b
+; CHECK-NEXT: umull v6.4s, v0.4h, v6.4h
+; CHECK-NEXT: umull v7.4s, v0.4h, v7.4h
+; CHECK-NEXT: umull v16.4s, v0.4h, v16.4h
+; CHECK-NEXT: umull v17.4s, v0.4h, v17.4h
+; CHECK-NEXT: umull v18.4s, v0.4h, v18.4h
+; CHECK-NEXT: umull v19.4s, v0.4h, v19.4h
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v25.16b
+; CHECK-NEXT: xtn v4.4h, v23.4s
+; CHECK-NEXT: xtn v5.4h, v24.4s
+; CHECK-NEXT: umull v20.4s, v0.4h, v20.4h
+; CHECK-NEXT: umull v21.4s, v0.4h, v21.4h
+; CHECK-NEXT: umull v22.4s, v0.4h, v22.4h
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
+; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: umull v3.4s, v0.4h, v4.4h
+; CHECK-NEXT: umull v4.4s, v0.4h, v5.4h
+; CHECK-NEXT: eor v5.16b, v6.16b, v20.16b
+; CHECK-NEXT: eor v6.16b, v7.16b, v21.16b
+; CHECK-NEXT: eor v7.16b, v16.16b, v22.16b
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b
+; CHECK-NEXT: eor v2.16b, v6.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %zextx = zext <4 x i16> %x to <4 x i32>
+ %zexty = zext <4 x i16> %y to <4 x i32>
+ %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %zextx, <4 x i32> %zexty)
+ ret <4 x i32> %a
+}
+
+define <2 x i32> @clmul_v2i32_neon_zext(<2 x i16> %x, <2 x i16> %y) {
+; CHECK-LABEL: clmul_v2i32_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d2, #0x00ffff0000ffff
+; CHECK-NEXT: movi v3.2s, #2
+; CHECK-NEXT: movi v4.2s, #1
+; CHECK-NEXT: movi v5.2s, #4
+; CHECK-NEXT: movi v6.2s, #8
+; CHECK-NEXT: movi v7.2s, #16
+; CHECK-NEXT: movi v16.2s, #32
+; CHECK-NEXT: movi v17.2s, #64
+; CHECK-NEXT: movi v18.2s, #128
+; CHECK-NEXT: movi v19.2s, #2, lsl #8
+; CHECK-NEXT: movi v20.2s, #8, lsl #8
+; CHECK-NEXT: movi v21.2s, #128, lsl #16
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: movi v22.2s, #8, lsl #16
+; CHECK-NEXT: movi v23.2s, #2, lsl #24
+; CHECK-NEXT: movi v25.2s, #4, lsl #24
+; CHECK-NEXT: movi v24.2s, #32, lsl #16
+; CHECK-NEXT: movi v26.2s, #8, lsl #24
+; CHECK-NEXT: and v2.8b, v1.8b, v3.8b
+; CHECK-NEXT: and v3.8b, v1.8b, v4.8b
+; CHECK-NEXT: and v4.8b, v1.8b, v5.8b
+; CHECK-NEXT: and v5.8b, v1.8b, v6.8b
+; CHECK-NEXT: and v7.8b, v1.8b, v7.8b
+; CHECK-NEXT: and v16.8b, v1.8b, v16.8b
+; CHECK-NEXT: movi v6.2s, #1, lsl #8
+; CHECK-NEXT: and v17.8b, v1.8b, v17.8b
+; CHECK-NEXT: and v18.8b, v1.8b, v18.8b
+; CHECK-NEXT: mul v2.2s, v0.2s, v2.2s
+; CHECK-NEXT: mul v3.2s, v0.2s, v3.2s
+; CHECK-NEXT: mul v4.2s, v0.2s, v4.2s
+; CHECK-NEXT: mul v5.2s, v0.2s, v5.2s
+; CHECK-NEXT: mul v7.2s, v0.2s, v7.2s
+; CHECK-NEXT: mul v16.2s, v0.2s, v16.2s
+; CHECK-NEXT: and v21.8b, v1.8b, v21.8b
+; CHECK-NEXT: and v23.8b, v1.8b, v23.8b
+; CHECK-NEXT: and v6.8b, v1.8b, v6.8b
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: eor v3.8b, v4.8b, v5.8b
+; CHECK-NEXT: movi v4.2s, #16, lsl #8
+; CHECK-NEXT: mul v5.2s, v0.2s, v17.2s
+; CHECK-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT: mul v17.2s, v0.2s, v18.2s
+; CHECK-NEXT: mul v6.2s, v0.2s, v6.2s
+; CHECK-NEXT: and v16.8b, v1.8b, v19.8b
+; CHECK-NEXT: movi v19.2s, #32, lsl #8
+; CHECK-NEXT: and v18.8b, v1.8b, v20.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: movi v20.2s, #64, lsl #8
+; CHECK-NEXT: mul v21.2s, v0.2s, v21.2s
+; CHECK-NEXT: and v3.8b, v1.8b, v4.8b
+; CHECK-NEXT: eor v5.8b, v7.8b, v5.8b
+; CHECK-NEXT: movi v7.2s, #2, lsl #16
+; CHECK-NEXT: movi v4.2s, #1, lsl #16
+; CHECK-NEXT: mul v16.2s, v0.2s, v16.2s
+; CHECK-NEXT: eor v6.8b, v17.8b, v6.8b
+; CHECK-NEXT: mul v18.2s, v0.2s, v18.2s
+; CHECK-NEXT: and v19.8b, v1.8b, v19.8b
+; CHECK-NEXT: movi v17.2s, #4, lsl #8
+; CHECK-NEXT: mul v3.2s, v0.2s, v3.2s
+; CHECK-NEXT: eor v2.8b, v2.8b, v5.8b
+; CHECK-NEXT: and v5.8b, v1.8b, v7.8b
+; CHECK-NEXT: movi v7.2s, #64, lsl #16
+; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
+; CHECK-NEXT: eor v6.8b, v6.8b, v16.8b
+; CHECK-NEXT: mul v16.2s, v0.2s, v19.2s
+; CHECK-NEXT: movi v19.2s, #4, lsl #16
+; CHECK-NEXT: and v17.8b, v1.8b, v17.8b
+; CHECK-NEXT: eor v3.8b, v18.8b, v3.8b
+; CHECK-NEXT: and v18.8b, v1.8b, v20.8b
+; CHECK-NEXT: movi v20.2s, #1, lsl #24
+; CHECK-NEXT: and v7.8b, v1.8b, v7.8b
+; CHECK-NEXT: mul v4.2s, v0.2s, v4.2s
+; CHECK-NEXT: mul v5.2s, v0.2s, v5.2s
+; CHECK-NEXT: mul v17.2s, v0.2s, v17.2s
+; CHECK-NEXT: eor v3.8b, v3.8b, v16.8b
+; CHECK-NEXT: and v16.8b, v1.8b, v19.8b
+; CHECK-NEXT: mul v18.2s, v0.2s, v18.2s
+; CHECK-NEXT: mul v7.2s, v0.2s, v7.2s
+; CHECK-NEXT: and v20.8b, v1.8b, v20.8b
+; CHECK-NEXT: movi v19.2s, #128, lsl #8
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: mul v5.2s, v0.2s, v16.2s
+; CHECK-NEXT: and v16.8b, v1.8b, v22.8b
+; CHECK-NEXT: movi v22.2s, #16, lsl #16
+; CHECK-NEXT: mul v20.2s, v0.2s, v20.2s
+; CHECK-NEXT: eor v6.8b, v6.8b, v17.8b
+; CHECK-NEXT: eor v3.8b, v3.8b, v18.8b
+; CHECK-NEXT: eor v7.8b, v7.8b, v21.8b
+; CHECK-NEXT: and v17.8b, v1.8b, v19.8b
+; CHECK-NEXT: mul v18.2s, v0.2s, v23.2s
+; CHECK-NEXT: and v19.8b, v1.8b, v25.8b
+; CHECK-NEXT: and v21.8b, v1.8b, v24.8b
+; CHECK-NEXT: movi v23.2s, #32, lsl #24
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: mul v5.2s, v0.2s, v16.2s
+; CHECK-NEXT: and v16.8b, v1.8b, v22.8b
+; CHECK-NEXT: eor v7.8b, v7.8b, v20.8b
+; CHECK-NEXT: movi v22.2s, #16, lsl #24
+; CHECK-NEXT: movi v24.2s, #64, lsl #24
+; CHECK-NEXT: and v20.8b, v1.8b, v26.8b
+; CHECK-NEXT: mul v17.2s, v0.2s, v17.2s
+; CHECK-NEXT: mul v19.2s, v0.2s, v19.2s
+; CHECK-NEXT: mul v16.2s, v0.2s, v16.2s
+; CHECK-NEXT: eor v2.8b, v2.8b, v6.8b
+; CHECK-NEXT: mul v6.2s, v0.2s, v21.2s
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: eor v5.8b, v7.8b, v18.8b
+; CHECK-NEXT: movi v7.2s, #128, lsl #24
+; CHECK-NEXT: mul v18.2s, v0.2s, v20.2s
+; CHECK-NEXT: and v20.8b, v1.8b, v22.8b
+; CHECK-NEXT: and v21.8b, v1.8b, v23.8b
+; CHECK-NEXT: and v22.8b, v1.8b, v24.8b
+; CHECK-NEXT: eor v3.8b, v3.8b, v17.8b
+; CHECK-NEXT: eor v4.8b, v4.8b, v16.8b
+; CHECK-NEXT: eor v5.8b, v5.8b, v19.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v7.8b
+; CHECK-NEXT: mul v7.2s, v0.2s, v20.2s
+; CHECK-NEXT: mul v16.2s, v0.2s, v21.2s
+; CHECK-NEXT: mul v17.2s, v0.2s, v22.2s
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: eor v3.8b, v4.8b, v6.8b
+; CHECK-NEXT: eor v4.8b, v5.8b, v18.8b
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: eor v1.8b, v2.8b, v3.8b
+; CHECK-NEXT: eor v2.8b, v4.8b, v7.8b
+; CHECK-NEXT: eor v3.8b, v16.8b, v17.8b
+; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ret
+ %zextx = zext <2 x i16> %x to <2 x i32>
+ %zexty = zext <2 x i16> %y to <2 x i32>
+ %a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %zextx, <2 x i32> %zexty)
+ ret <2 x i32> %a
+}
+
+define <2 x i64> @clmul_v2i64_neon_zext(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: clmul_v2i64_neon_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: dup v4.2d, x9
+; CHECK-NEXT: dup v3.2d, x8
+; CHECK-NEXT: mov w8, #8 // =0x8
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #16 // =0x10
+; CHECK-NEXT: dup v7.2d, x9
+; CHECK-NEXT: dup v6.2d, x8
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: mov w8, #64 // =0x40
+; CHECK-NEXT: movi v24.2d, #0000000000000000
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: xtn v2.2s, v2.2d
+; CHECK-NEXT: mov w8, #128 // =0x80
+; CHECK-NEXT: xtn v3.2s, v3.2d
+; CHECK-NEXT: xtn v4.2s, v4.2d
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: mov w8, #256 // =0x100
+; CHECK-NEXT: xtn v7.2s, v7.2d
+; CHECK-NEXT: xtn v6.2s, v6.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: umull v3.2d, v0.2s, v3.2s
+; CHECK-NEXT: umull v4.2d, v0.2s, v4.2s
+; CHECK-NEXT: fmov v26.2d, #2.00000000
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: umull v7.2d, v0.2s, v7.2s
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: fneg v24.2d, v24.2d
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v26.16b, v1.16b, v26.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v18.16b
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #2048 // =0x800
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: umull v7.2d, v0.2s, v16.2s
+; CHECK-NEXT: xtn v16.2s, v17.2d
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: xtn v4.2s, v4.2d
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: dup v3.2d, x8
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v18.16b
+; CHECK-NEXT: umull v16.2d, v0.2s, v16.2s
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #65536 // =0x10000
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: umull v4.2d, v0.2s, v4.2s
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: xtn v6.2s, v7.2d
+; CHECK-NEXT: dup v7.2d, x8
+; CHECK-NEXT: mov w8, #131072 // =0x20000
+; CHECK-NEXT: dup v19.2d, x8
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT: xtn v3.2s, v3.2d
+; CHECK-NEXT: umull v16.2d, v0.2s, v17.2s
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #262144 // =0x40000
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: xtn v18.2s, v18.2d
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #32768 // =0x8000
+; CHECK-NEXT: xtn v7.2s, v7.2d
+; CHECK-NEXT: xtn v19.2s, v19.2d
+; CHECK-NEXT: umull v3.2d, v0.2s, v3.2s
+; CHECK-NEXT: eor v6.16b, v16.16b, v6.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: mov w8, #4194304 // =0x400000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: dup v20.2d, x8
+; CHECK-NEXT: mov w8, #8388608 // =0x800000
+; CHECK-NEXT: umull v18.2d, v0.2s, v18.2s
+; CHECK-NEXT: dup v21.2d, x8
+; CHECK-NEXT: mov w8, #16777216 // =0x1000000
+; CHECK-NEXT: umull v7.2d, v0.2s, v7.2s
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: umull v19.2d, v0.2s, v19.2s
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: dup v4.2d, x8
+; CHECK-NEXT: mov w8, #524288 // =0x80000
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: dup v22.2d, x8
+; CHECK-NEXT: mov w8, #1048576 // =0x100000
+; CHECK-NEXT: eor v6.16b, v6.16b, v18.16b
+; CHECK-NEXT: umull v17.2d, v0.2s, v17.2s
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: eor v7.16b, v7.16b, v19.16b
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: xtn v19.2s, v20.2d
+; CHECK-NEXT: xtn v20.2s, v21.2d
+; CHECK-NEXT: and v18.16b, v1.16b, v22.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v22.4s, #128, lsl #24
+; CHECK-NEXT: xtn v21.2s, v4.2d
+; CHECK-NEXT: eor v3.16b, v6.16b, v17.16b
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #2097152 // =0x200000
+; CHECK-NEXT: umull v4.2d, v0.2s, v16.2s
+; CHECK-NEXT: eor v5.16b, v7.16b, v5.16b
+; CHECK-NEXT: umull v7.2d, v0.2s, v19.2s
+; CHECK-NEXT: umull v16.2d, v0.2s, v20.2s
+; CHECK-NEXT: xtn v6.2s, v18.2d
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #33554432 // =0x2000000
+; CHECK-NEXT: fneg v22.2d, v22.2d
+; CHECK-NEXT: dup v20.2d, x8
+; CHECK-NEXT: mov w8, #67108864 // =0x4000000
+; CHECK-NEXT: umull v19.2d, v0.2s, v21.2s
+; CHECK-NEXT: dup v21.2d, x8
+; CHECK-NEXT: mov w8, #536870912 // =0x20000000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v7.16b, v7.16b, v16.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: mov w8, #1073741824 // =0x40000000
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: dup v23.2d, x8
+; CHECK-NEXT: mov w8, #134217728 // =0x8000000
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: eor v7.16b, v7.16b, v19.16b
+; CHECK-NEXT: dup v19.2d, x8
+; CHECK-NEXT: mov w8, #268435456 // =0x10000000
+; CHECK-NEXT: xtn v20.2s, v20.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT: xtn v21.2s, v21.2d
+; CHECK-NEXT: dup v25.2d, x8
+; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: xtn v18.2s, v18.2d
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: xtn v23.2s, v23.2d
+; CHECK-NEXT: umull v17.2d, v0.2s, v17.2s
+; CHECK-NEXT: umull v20.2d, v0.2s, v20.2s
+; CHECK-NEXT: and v25.16b, v1.16b, v25.16b
+; CHECK-NEXT: xtn v22.2s, v22.2d
+; CHECK-NEXT: xtn v19.2s, v19.2d
+; CHECK-NEXT: umull v21.2d, v0.2s, v21.2s
+; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v24.16b
+; CHECK-NEXT: umull v18.2d, v0.2s, v18.2s
+; CHECK-NEXT: umull v16.2d, v0.2s, v16.2s
+; CHECK-NEXT: umull v23.2d, v0.2s, v23.2s
+; CHECK-NEXT: xtn v6.2s, v25.2d
+; CHECK-NEXT: eor v5.16b, v7.16b, v20.16b
+; CHECK-NEXT: xtn v7.2s, v26.2d
+; CHECK-NEXT: umull v20.2d, v0.2s, v22.2s
+; CHECK-NEXT: umull v19.2d, v0.2s, v19.2s
+; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v21.16b
+; CHECK-NEXT: eor v16.16b, v16.16b, v23.16b
+; CHECK-NEXT: umull v3.2d, v0.2s, v6.2s
+; CHECK-NEXT: umull v6.2d, v0.2s, v7.2s
+; CHECK-NEXT: eor v4.16b, v4.16b, v18.16b
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: eor v5.16b, v5.16b, v19.16b
+; CHECK-NEXT: eor v7.16b, v16.16b, v20.16b
+; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v2.16b, v5.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v7.16b, v6.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %zextx = zext <2 x i32> %x to <2 x i64>
+ %zexty = zext <2 x i32> %y to <2 x i64>
+ %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %zextx, <2 x i64> %zexty)
+ ret <2 x i64> %a
+}
+
+; TODO: Fix: re-enable once <1 x i64> clmul zext lowering works (currently disabled)
+; define <1 x i64> @clmul_v1i64_neon_zext(<1 x i32> %x, <1 x i32> %y) {
+; %zextx = zext <1 x i32> %x to <1 x i64>
+; %zexty = zext <1 x i32> %y to <1 x i64>
+; %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %zextx, <1 x i64> %zexty)
+; ret <1 x i64> %a
+; }
+
+define <2 x i64> @clmul_v2i64_neon_aes_zext(<2 x i32> %x, <2 x i32> %y) #0 {
+; CHECK-LABEL: clmul_v2i64_neon_aes_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: dup v4.2d, x9
+; CHECK-NEXT: dup v3.2d, x8
+; CHECK-NEXT: mov w8, #8 // =0x8
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #16 // =0x10
+; CHECK-NEXT: dup v7.2d, x9
+; CHECK-NEXT: dup v6.2d, x8
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: mov w8, #64 // =0x40
+; CHECK-NEXT: movi v24.2d, #0000000000000000
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: xtn v2.2s, v2.2d
+; CHECK-NEXT: mov w8, #128 // =0x80
+; CHECK-NEXT: xtn v3.2s, v3.2d
+; CHECK-NEXT: xtn v4.2s, v4.2d
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: mov w8, #256 // =0x100
+; CHECK-NEXT: xtn v7.2s, v7.2d
+; CHECK-NEXT: xtn v6.2s, v6.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #512 // =0x200
+; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: umull v3.2d, v0.2s, v3.2s
+; CHECK-NEXT: umull v4.2d, v0.2s, v4.2s
+; CHECK-NEXT: fmov v26.2d, #2.00000000
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: umull v7.2d, v0.2s, v7.2s
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: fneg v24.2d, v24.2d
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v26.16b, v1.16b, v26.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v18.16b
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #2048 // =0x800
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: umull v7.2d, v0.2s, v16.2s
+; CHECK-NEXT: xtn v16.2s, v17.2d
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: xtn v4.2s, v4.2d
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: dup v3.2d, x8
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v18.16b
+; CHECK-NEXT: umull v16.2d, v0.2s, v16.2s
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #65536 // =0x10000
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: umull v4.2d, v0.2s, v4.2s
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: xtn v6.2s, v7.2d
+; CHECK-NEXT: dup v7.2d, x8
+; CHECK-NEXT: mov w8, #131072 // =0x20000
+; CHECK-NEXT: dup v19.2d, x8
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT: xtn v3.2s, v3.2d
+; CHECK-NEXT: umull v16.2d, v0.2s, v17.2s
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #262144 // =0x40000
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: xtn v18.2s, v18.2d
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: dup v5.2d, x8
+; CHECK-NEXT: mov w8, #32768 // =0x8000
+; CHECK-NEXT: xtn v7.2s, v7.2d
+; CHECK-NEXT: xtn v19.2s, v19.2d
+; CHECK-NEXT: umull v3.2d, v0.2s, v3.2s
+; CHECK-NEXT: eor v6.16b, v16.16b, v6.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: mov w8, #4194304 // =0x400000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: dup v20.2d, x8
+; CHECK-NEXT: mov w8, #8388608 // =0x800000
+; CHECK-NEXT: umull v18.2d, v0.2s, v18.2s
+; CHECK-NEXT: dup v21.2d, x8
+; CHECK-NEXT: mov w8, #16777216 // =0x1000000
+; CHECK-NEXT: umull v7.2d, v0.2s, v7.2s
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: umull v19.2d, v0.2s, v19.2s
+; CHECK-NEXT: xtn v5.2s, v5.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: dup v4.2d, x8
+; CHECK-NEXT: mov w8, #524288 // =0x80000
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: dup v22.2d, x8
+; CHECK-NEXT: mov w8, #1048576 // =0x100000
+; CHECK-NEXT: eor v6.16b, v6.16b, v18.16b
+; CHECK-NEXT: umull v17.2d, v0.2s, v17.2s
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: eor v7.16b, v7.16b, v19.16b
+; CHECK-NEXT: umull v5.2d, v0.2s, v5.2s
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: xtn v19.2s, v20.2d
+; CHECK-NEXT: xtn v20.2s, v21.2d
+; CHECK-NEXT: and v18.16b, v1.16b, v22.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v22.4s, #128, lsl #24
+; CHECK-NEXT: xtn v21.2s, v4.2d
+; CHECK-NEXT: eor v3.16b, v6.16b, v17.16b
+; CHECK-NEXT: dup v17.2d, x8
+; CHECK-NEXT: mov w8, #2097152 // =0x200000
+; CHECK-NEXT: umull v4.2d, v0.2s, v16.2s
+; CHECK-NEXT: eor v5.16b, v7.16b, v5.16b
+; CHECK-NEXT: umull v7.2d, v0.2s, v19.2s
+; CHECK-NEXT: umull v16.2d, v0.2s, v20.2s
+; CHECK-NEXT: xtn v6.2s, v18.2d
+; CHECK-NEXT: dup v18.2d, x8
+; CHECK-NEXT: mov w8, #33554432 // =0x2000000
+; CHECK-NEXT: fneg v22.2d, v22.2d
+; CHECK-NEXT: dup v20.2d, x8
+; CHECK-NEXT: mov w8, #67108864 // =0x4000000
+; CHECK-NEXT: umull v19.2d, v0.2s, v21.2s
+; CHECK-NEXT: dup v21.2d, x8
+; CHECK-NEXT: mov w8, #536870912 // =0x20000000
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v7.16b, v7.16b, v16.16b
+; CHECK-NEXT: dup v16.2d, x8
+; CHECK-NEXT: mov w8, #1073741824 // =0x40000000
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: dup v23.2d, x8
+; CHECK-NEXT: mov w8, #134217728 // =0x8000000
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: xtn v17.2s, v17.2d
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: eor v7.16b, v7.16b, v19.16b
+; CHECK-NEXT: dup v19.2d, x8
+; CHECK-NEXT: mov w8, #268435456 // =0x10000000
+; CHECK-NEXT: xtn v20.2s, v20.2d
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT: xtn v21.2s, v21.2d
+; CHECK-NEXT: dup v25.2d, x8
+; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: umull v6.2d, v0.2s, v6.2s
+; CHECK-NEXT: xtn v18.2s, v18.2d
+; CHECK-NEXT: xtn v16.2s, v16.2d
+; CHECK-NEXT: xtn v23.2s, v23.2d
+; CHECK-NEXT: umull v17.2d, v0.2s, v17.2s
+; CHECK-NEXT: umull v20.2d, v0.2s, v20.2s
+; CHECK-NEXT: and v25.16b, v1.16b, v25.16b
+; CHECK-NEXT: xtn v22.2s, v22.2d
+; CHECK-NEXT: xtn v19.2s, v19.2d
+; CHECK-NEXT: umull v21.2d, v0.2s, v21.2s
+; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v24.16b
+; CHECK-NEXT: umull v18.2d, v0.2s, v18.2s
+; CHECK-NEXT: umull v16.2d, v0.2s, v16.2s
+; CHECK-NEXT: umull v23.2d, v0.2s, v23.2s
+; CHECK-NEXT: xtn v6.2s, v25.2d
+; CHECK-NEXT: eor v5.16b, v7.16b, v20.16b
+; CHECK-NEXT: xtn v7.2s, v26.2d
+; CHECK-NEXT: umull v20.2d, v0.2s, v22.2s
+; CHECK-NEXT: umull v19.2d, v0.2s, v19.2s
+; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v21.16b
+; CHECK-NEXT: eor v16.16b, v16.16b, v23.16b
+; CHECK-NEXT: umull v3.2d, v0.2s, v6.2s
+; CHECK-NEXT: umull v6.2d, v0.2s, v7.2s
+; CHECK-NEXT: eor v4.16b, v4.16b, v18.16b
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: eor v5.16b, v5.16b, v19.16b
+; CHECK-NEXT: eor v7.16b, v16.16b, v20.16b
+; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v2.16b, v5.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v7.16b, v6.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %zextx = zext <2 x i32> %x to <2 x i64>
+ %zexty = zext <2 x i32> %y to <2 x i64>
+ %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %zextx, <2 x i64> %zexty)
+ ret <2 x i64> %a
+}
+
+; TODO: Fix: lower with PMULL and PMULL2 .d variants which require FEAT_PMULL
+; define <1 x i64> @clmul_v1i64_neon_aes_zext(<1 x i32> %x, <1 x i32> %y) #0 {
+; %zextx = zext <1 x i32> %x to <1 x i64>
+; %zexty = zext <1 x i32> %y to <1 x i64>
+; %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %zextx, <1 x i64> %zexty)
+; ret <1 x i64> %a
+; }
+
+attributes #0 = { "target-features"="+aes" }
diff --git a/llvm/test/CodeGen/AArch64/clmul-scalable.ll b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
index 889bfccace0e8..eed564e0a612f 100644
--- a/llvm/test/CodeGen/AArch64/clmul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-scalable.ll
@@ -1167,6 +1167,819 @@ define <vscale x 2 x i64> @clmul_nxv2i64_sve2_aes(<vscale x 2 x i64> %x, <vscale
ret <vscale x 2 x i64> %a
}
+define <vscale x 16 x i8> @clmul_nxv16i8_sve_zext(<vscale x 16 x i4> %x, <vscale x 16 x i4> %y) #0 {
+; CHECK-LABEL: clmul_nxv16i8_sve_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: and z0.b, z0.b, #0xf
+; CHECK-NEXT: and z1.b, z1.b, #0x8
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: and z2.b, z2.b, #0x2
+; CHECK-NEXT: and z3.b, z3.b, #0x1
+; CHECK-NEXT: and z4.b, z4.b, #0x4
+; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
+; CHECK-NEXT: mul z2.b, p0/m, z2.b, z0.b
+; CHECK-NEXT: mul z3.b, p0/m, z3.b, z0.b
+; CHECK-NEXT: mul z4.b, p0/m, z4.b, z0.b
+; CHECK-NEXT: mul z0.b, z0.b, #0
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: eor z1.d, z4.d, z1.d
+; CHECK-NEXT: eor z3.d, z0.d, z0.d
+; CHECK-NEXT: eor z1.d, z2.d, z1.d
+; CHECK-NEXT: eor z2.d, z3.d, z0.d
+; CHECK-NEXT: eor z1.d, z1.d, z2.d
+; CHECK-NEXT: eor z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 16 x i4> %x to <vscale x 16 x i8>
+ %zexty = zext <vscale x 16 x i4> %y to <vscale x 16 x i8>
+ %a = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %zextx, <vscale x 16 x i8> %zexty)
+ ret <vscale x 16 x i8> %a
+}
+
+define <vscale x 8 x i16> @clmul_nxv8i16_sve_zext(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) #0 {
+; CHECK-LABEL: clmul_nxv8i16_sve_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: mov z7.d, z1.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: and z1.h, z1.h, #0x80
+; CHECK-NEXT: and z2.h, z2.h, #0x2
+; CHECK-NEXT: and z3.h, z3.h, #0x1
+; CHECK-NEXT: and z4.h, z4.h, #0x4
+; CHECK-NEXT: and z5.h, z5.h, #0x8
+; CHECK-NEXT: and z6.h, z6.h, #0x10
+; CHECK-NEXT: and z7.h, z7.h, #0x20
+; CHECK-NEXT: and z24.h, z24.h, #0x40
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mul z2.h, p0/m, z2.h, z0.h
+; CHECK-NEXT: mul z3.h, p0/m, z3.h, z0.h
+; CHECK-NEXT: mul z4.h, p0/m, z4.h, z0.h
+; CHECK-NEXT: mul z5.h, p0/m, z5.h, z0.h
+; CHECK-NEXT: mul z6.h, p0/m, z6.h, z0.h
+; CHECK-NEXT: mul z7.h, p0/m, z7.h, z0.h
+; CHECK-NEXT: mul z24.h, p0/m, z24.h, z0.h
+; CHECK-NEXT: mul z1.h, p0/m, z1.h, z0.h
+; CHECK-NEXT: mul z0.h, z0.h, #0
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: eor z3.d, z4.d, z5.d
+; CHECK-NEXT: eor z4.d, z6.d, z7.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z5.d, z0.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: eor z3.d, z4.d, z24.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z4.d, z5.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z3.d, z4.d, z0.d
+; CHECK-NEXT: eor z1.d, z2.d, z1.d
+; CHECK-NEXT: eor z0.d, z3.d, z0.d
+; CHECK-NEXT: eor z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
+ %zexty = zext <vscale x 8 x i8> %y to <vscale x 8 x i16>
+ %a = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %zextx, <vscale x 8 x i16> %zexty)
+ ret <vscale x 8 x i16> %a
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_sve_zext(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) #0 {
+; CHECK-LABEL: clmul_nxv4i32_sve_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: mov z7.d, z1.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: mov z25.d, z1.d
+; CHECK-NEXT: mov z26.d, z1.d
+; CHECK-NEXT: mov z27.d, z1.d
+; CHECK-NEXT: mov z28.d, z1.d
+; CHECK-NEXT: mov z29.d, z1.d
+; CHECK-NEXT: mov z30.d, z1.d
+; CHECK-NEXT: mov z31.d, z1.d
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: and z2.s, z2.s, #0x2
+; CHECK-NEXT: and z3.s, z3.s, #0x1
+; CHECK-NEXT: and z4.s, z4.s, #0x4
+; CHECK-NEXT: and z5.s, z5.s, #0x8
+; CHECK-NEXT: and z6.s, z6.s, #0x10
+; CHECK-NEXT: and z7.s, z7.s, #0x20
+; CHECK-NEXT: and z24.s, z24.s, #0x40
+; CHECK-NEXT: and z25.s, z25.s, #0x80
+; CHECK-NEXT: and z26.s, z26.s, #0x100
+; CHECK-NEXT: and z27.s, z27.s, #0x200
+; CHECK-NEXT: and z28.s, z28.s, #0x400
+; CHECK-NEXT: and z29.s, z29.s, #0x800
+; CHECK-NEXT: and z30.s, z30.s, #0x1000
+; CHECK-NEXT: and z31.s, z31.s, #0x2000
+; CHECK-NEXT: and z8.s, z8.s, #0x4000
+; CHECK-NEXT: and z1.s, z1.s, #0x8000
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mul z2.s, p0/m, z2.s, z0.s
+; CHECK-NEXT: mul z3.s, p0/m, z3.s, z0.s
+; CHECK-NEXT: mul z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT: mul z5.s, p0/m, z5.s, z0.s
+; CHECK-NEXT: mul z6.s, p0/m, z6.s, z0.s
+; CHECK-NEXT: mul z7.s, p0/m, z7.s, z0.s
+; CHECK-NEXT: mul z24.s, p0/m, z24.s, z0.s
+; CHECK-NEXT: mul z25.s, p0/m, z25.s, z0.s
+; CHECK-NEXT: mul z26.s, p0/m, z26.s, z0.s
+; CHECK-NEXT: mul z27.s, p0/m, z27.s, z0.s
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: mul z28.s, p0/m, z28.s, z0.s
+; CHECK-NEXT: mul z29.s, p0/m, z29.s, z0.s
+; CHECK-NEXT: eor z3.d, z4.d, z5.d
+; CHECK-NEXT: mul z30.s, p0/m, z30.s, z0.s
+; CHECK-NEXT: mul z31.s, p0/m, z31.s, z0.s
+; CHECK-NEXT: eor z4.d, z6.d, z7.d
+; CHECK-NEXT: mul z8.s, p0/m, z8.s, z0.s
+; CHECK-NEXT: mul z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT: mul z0.s, z0.s, #0
+; CHECK-NEXT: eor z5.d, z25.d, z26.d
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: eor z3.d, z4.d, z24.d
+; CHECK-NEXT: eor z6.d, z29.d, z30.d
+; CHECK-NEXT: eor z4.d, z5.d, z27.d
+; CHECK-NEXT: eor z7.d, z0.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: eor z5.d, z6.d, z31.d
+; CHECK-NEXT: eor z3.d, z4.d, z28.d
+; CHECK-NEXT: eor z6.d, z7.d, z0.d
+; CHECK-NEXT: eor z4.d, z5.d, z8.d
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: eor z5.d, z6.d, z0.d
+; CHECK-NEXT: eor z1.d, z4.d, z1.d
+; CHECK-NEXT: eor z3.d, z5.d, z0.d
+; CHECK-NEXT: eor z1.d, z2.d, z1.d
+; CHECK-NEXT: eor z2.d, z3.d, z0.d
+; CHECK-NEXT: eor z1.d, z1.d, z2.d
+; CHECK-NEXT: eor z0.d, z2.d, z0.d
+; CHECK-NEXT: eor z0.d, z1.d, z0.d
+; CHECK-NEXT: eor z0.d, z0.d, z6.d
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %zexty = zext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %zextx, <vscale x 4 x i32> %zexty)
+ ret <vscale x 4 x i32> %a
+}
+
+define <vscale x 2 x i64> @clmul_nxv2i64_sve_zext(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) #0 {
+; CHECK-LABEL: clmul_nxv2i64_sve_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 16 + 152 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: mov z29.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z30.d, z1.d
+; CHECK-NEXT: mov z10.d, z1.d
+; CHECK-NEXT: mov z11.d, z1.d
+; CHECK-NEXT: mov z12.d, z1.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: and z6.d, z6.d, #0x10
+; CHECK-NEXT: and z29.d, z29.d, #0x200
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z30.d, z30.d, #0x1000
+; CHECK-NEXT: and z10.d, z10.d, #0x4000
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: and z11.d, z11.d, #0x8000
+; CHECK-NEXT: and z12.d, z12.d, #0x10000
+; CHECK-NEXT: and z2.d, z2.d, #0x2
+; CHECK-NEXT: movprfx z24, z0
+; CHECK-NEXT: mul z24.d, p0/m, z24.d, z6.d
+; CHECK-NEXT: movprfx z6, z0
+; CHECK-NEXT: mul z6.d, p0/m, z6.d, z29.d
+; CHECK-NEXT: mov z29.d, z1.d
+; CHECK-NEXT: movprfx z9, z0
+; CHECK-NEXT: mul z9.d, p0/m, z9.d, z30.d
+; CHECK-NEXT: movprfx z30, z0
+; CHECK-NEXT: mul z30.d, p0/m, z30.d, z10.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: movprfx z10, z0
+; CHECK-NEXT: mul z10.d, p0/m, z10.d, z12.d
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
+; CHECK-NEXT: mov z26.d, z1.d
+; CHECK-NEXT: and z29.d, z29.d, #0x800
+; CHECK-NEXT: mov z27.d, z1.d
+; CHECK-NEXT: mov z28.d, z1.d
+; CHECK-NEXT: mov z12.d, z1.d
+; CHECK-NEXT: mov z15.d, z1.d
+; CHECK-NEXT: mov z16.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x1
+; CHECK-NEXT: and z4.d, z4.d, #0x4
+; CHECK-NEXT: and z5.d, z5.d, #0x8
+; CHECK-NEXT: movprfx z8, z0
+; CHECK-NEXT: mul z8.d, p0/m, z8.d, z29.d
+; CHECK-NEXT: movprfx z29, z0
+; CHECK-NEXT: mul z29.d, p0/m, z29.d, z11.d
+; CHECK-NEXT: str z2, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov z11.d, z1.d
+; CHECK-NEXT: and z26.d, z26.d, #0x40
+; CHECK-NEXT: and z27.d, z27.d, #0x80
+; CHECK-NEXT: mul z3.d, p0/m, z3.d, z0.d
+; CHECK-NEXT: mul z4.d, p0/m, z4.d, z0.d
+; CHECK-NEXT: movprfx z7, z0
+; CHECK-NEXT: mul z7.d, p0/m, z7.d, z5.d
+; CHECK-NEXT: and z28.d, z28.d, #0x100
+; CHECK-NEXT: and z12.d, z12.d, #0x40000
+; CHECK-NEXT: and z15.d, z15.d, #0x100000
+; CHECK-NEXT: movprfx z5, z0
+; CHECK-NEXT: mul z5.d, p0/m, z5.d, z26.d
+; CHECK-NEXT: movprfx z26, z0
+; CHECK-NEXT: mul z26.d, p0/m, z26.d, z27.d
+; CHECK-NEXT: and z11.d, z11.d, #0x20000
+; CHECK-NEXT: and z16.d, z16.d, #0x200000
+; CHECK-NEXT: mov z25.d, z1.d
+; CHECK-NEXT: mov z31.d, z1.d
+; CHECK-NEXT: movprfx z27, z0
+; CHECK-NEXT: mul z27.d, p0/m, z27.d, z28.d
+; CHECK-NEXT: movprfx z14, z0
+; CHECK-NEXT: mul z14.d, p0/m, z14.d, z12.d
+; CHECK-NEXT: movprfx z12, z0
+; CHECK-NEXT: mul z12.d, p0/m, z12.d, z15.d
+; CHECK-NEXT: movprfx z19, z0
+; CHECK-NEXT: mul z19.d, p0/m, z19.d, z11.d
+; CHECK-NEXT: str z3, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ldr z3, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z28.d, z1.d
+; CHECK-NEXT: mov z13.d, z1.d
+; CHECK-NEXT: movprfx z11, z0
+; CHECK-NEXT: mul z11.d, p0/m, z11.d, z16.d
+; CHECK-NEXT: mov z17.d, z1.d
+; CHECK-NEXT: mov z18.d, z1.d
+; CHECK-NEXT: str z4, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov z15.d, z1.d
+; CHECK-NEXT: mov z16.d, z1.d
+; CHECK-NEXT: ldr z4, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z20.d, z1.d
+; CHECK-NEXT: mov z21.d, z1.d
+; CHECK-NEXT: mov z22.d, z1.d
+; CHECK-NEXT: mov z23.d, z1.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: and z25.d, z25.d, #0x20
+; CHECK-NEXT: and z28.d, z28.d, #0x400
+; CHECK-NEXT: and z31.d, z31.d, #0x2000
+; CHECK-NEXT: and z13.d, z13.d, #0x80000
+; CHECK-NEXT: and z17.d, z17.d, #0x400000
+; CHECK-NEXT: and z18.d, z18.d, #0x800000
+; CHECK-NEXT: and z15.d, z15.d, #0x1000000
+; CHECK-NEXT: mul z25.d, p0/m, z25.d, z0.d
+; CHECK-NEXT: and z16.d, z16.d, #0x2000000
+; CHECK-NEXT: and z20.d, z20.d, #0x4000000
+; CHECK-NEXT: mul z28.d, p0/m, z28.d, z0.d
+; CHECK-NEXT: mul z31.d, p0/m, z31.d, z0.d
+; CHECK-NEXT: and z21.d, z21.d, #0x8000000
+; CHECK-NEXT: mul z13.d, p0/m, z13.d, z0.d
+; CHECK-NEXT: mul z17.d, p0/m, z17.d, z0.d
+; CHECK-NEXT: and z22.d, z22.d, #0x10000000
+; CHECK-NEXT: mul z18.d, p0/m, z18.d, z0.d
+; CHECK-NEXT: mul z15.d, p0/m, z15.d, z0.d
+; CHECK-NEXT: and z23.d, z23.d, #0x20000000
+; CHECK-NEXT: and z2.d, z2.d, #0x40000000
+; CHECK-NEXT: mul z16.d, p0/m, z16.d, z0.d
+; CHECK-NEXT: mul z20.d, p0/m, z20.d, z0.d
+; CHECK-NEXT: and z1.d, z1.d, #0x80000000
+; CHECK-NEXT: mul z21.d, p0/m, z21.d, z0.d
+; CHECK-NEXT: mul z22.d, p0/m, z22.d, z0.d
+; CHECK-NEXT: mul z23.d, p0/m, z23.d, z0.d
+; CHECK-NEXT: eor z3.d, z4.d, z3.d
+; CHECK-NEXT: ldr z4, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: mul z0.d, z0.d, #0
+; CHECK-NEXT: eor z4.d, z4.d, z7.d
+; CHECK-NEXT: eor z7.d, z24.d, z25.d
+; CHECK-NEXT: eor z24.d, z26.d, z27.d
+; CHECK-NEXT: eor z25.d, z8.d, z9.d
+; CHECK-NEXT: eor z26.d, z10.d, z19.d
+; CHECK-NEXT: eor z27.d, z17.d, z18.d
+; CHECK-NEXT: eor z2.d, z23.d, z2.d
+; CHECK-NEXT: eor z8.d, z0.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z4.d, z7.d, z5.d
+; CHECK-NEXT: eor z5.d, z24.d, z6.d
+; CHECK-NEXT: eor z6.d, z25.d, z31.d
+; CHECK-NEXT: eor z7.d, z26.d, z14.d
+; CHECK-NEXT: eor z24.d, z27.d, z15.d
+; CHECK-NEXT: eor z1.d, z2.d, z1.d
+; CHECK-NEXT: eor z2.d, z8.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z4.d, z5.d, z28.d
+; CHECK-NEXT: eor z5.d, z6.d, z30.d
+; CHECK-NEXT: eor z6.d, z7.d, z13.d
+; CHECK-NEXT: eor z7.d, z24.d, z16.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z4.d, z5.d, z29.d
+; CHECK-NEXT: eor z5.d, z6.d, z12.d
+; CHECK-NEXT: eor z6.d, z7.d, z20.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z4.d, z5.d, z11.d
+; CHECK-NEXT: eor z5.d, z6.d, z21.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z4.d, z5.d, z22.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z0.d
+; CHECK-NEXT: eor z3.d, z3.d, z4.d
+; CHECK-NEXT: eor z1.d, z1.d, z0.d
+; CHECK-NEXT: eor z2.d, z2.d, z0.d
+; CHECK-NEXT: eor z1.d, z3.d, z1.d
+; CHECK-NEXT: eor z3.d, z2.d, z0.d
+; CHECK-NEXT: eor z1.d, z1.d, z3.d
+; CHECK-NEXT: eor z0.d, z3.d, z0.d
+; CHECK-NEXT: eor z0.d, z1.d, z0.d
+; CHECK-NEXT: eor z0.d, z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 2 x i32> %x to <vscale x 2 x i64>
+ %zexty = zext <vscale x 2 x i32> %y to <vscale x 2 x i64>
+ %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %zextx, <vscale x 2 x i64> %zexty)
+ ret <vscale x 2 x i64> %a
+}
+
+define <vscale x 16 x i8> @clmul_nxv16i8_sve2_zext(<vscale x 16 x i4> %x, <vscale x 16 x i4> %y) #1 {
+; CHECK-LABEL: clmul_nxv16i8_sve2_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.b, z0.b, #0xf
+; CHECK-NEXT: and z1.b, z1.b, #0xf
+; CHECK-NEXT: pmul z0.b, z0.b, z1.b
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 16 x i4> %x to <vscale x 16 x i8>
+ %zexty = zext <vscale x 16 x i4> %y to <vscale x 16 x i8>
+ %a = call <vscale x 16 x i8> @llvm.clmul.nxv16i8(<vscale x 16 x i8> %zextx, <vscale x 16 x i8> %zexty)
+ ret <vscale x 16 x i8> %a
+}
+
+define <vscale x 8 x i16> @clmul_nxv8i16_sve2_zext(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) #1 {
+; CHECK-LABEL: clmul_nxv8i16_sve2_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z2.h, z2.h, #0x2
+; CHECK-NEXT: and z3.h, z3.h, #0x1
+; CHECK-NEXT: and z4.h, z4.h, #0x8
+; CHECK-NEXT: and z5.h, z5.h, #0x4
+; CHECK-NEXT: and z6.h, z6.h, #0x20
+; CHECK-NEXT: mul z2.h, z0.h, z2.h
+; CHECK-NEXT: mul z3.h, z0.h, z3.h
+; CHECK-NEXT: mul z4.h, z0.h, z4.h
+; CHECK-NEXT: mul z5.h, z0.h, z5.h
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z3.h, z3.h, #0x10
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z4.h, z0.h, z6.h
+; CHECK-NEXT: and z1.h, z1.h, #0x40
+; CHECK-NEXT: mul z3.h, z0.h, z3.h
+; CHECK-NEXT: and z5.h, z5.h, #0x80
+; CHECK-NEXT: mul z1.h, z0.h, z1.h
+; CHECK-NEXT: eor3 z2.d, z2.d, z3.d, z4.d
+; CHECK-NEXT: mul z3.h, z0.h, z5.h
+; CHECK-NEXT: mul z0.h, z0.h, #0
+; CHECK-NEXT: eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 8 x i8> %x to <vscale x 8 x i16>
+ %zexty = zext <vscale x 8 x i8> %y to <vscale x 8 x i16>
+ %a = call <vscale x 8 x i16> @llvm.clmul.nxv8i16(<vscale x 8 x i16> %zextx, <vscale x 8 x i16> %zexty)
+ ret <vscale x 8 x i16> %a
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32_sve2_zext(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) #1 {
+; CHECK-LABEL: clmul_nxv4i32_sve2_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z2.s, z2.s, #0x2
+; CHECK-NEXT: and z3.s, z3.s, #0x1
+; CHECK-NEXT: and z4.s, z4.s, #0x8
+; CHECK-NEXT: and z5.s, z5.s, #0x4
+; CHECK-NEXT: and z6.s, z6.s, #0x20
+; CHECK-NEXT: mul z2.s, z0.s, z2.s
+; CHECK-NEXT: mul z3.s, z0.s, z3.s
+; CHECK-NEXT: mul z4.s, z0.s, z4.s
+; CHECK-NEXT: mul z5.s, z0.s, z5.s
+; CHECK-NEXT: mul z6.s, z0.s, z6.s
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z3.s, z3.s, #0x10
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.s, z0.s, z3.s
+; CHECK-NEXT: and z4.s, z4.s, #0x80
+; CHECK-NEXT: and z5.s, z5.s, #0x40
+; CHECK-NEXT: mul z4.s, z0.s, z4.s
+; CHECK-NEXT: mul z5.s, z0.s, z5.s
+; CHECK-NEXT: eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.s, z3.s, #0x200
+; CHECK-NEXT: and z6.s, z6.s, #0x100
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.s, z0.s, z3.s
+; CHECK-NEXT: mul z6.s, z0.s, z6.s
+; CHECK-NEXT: and z4.s, z4.s, #0x800
+; CHECK-NEXT: and z5.s, z5.s, #0x400
+; CHECK-NEXT: mul z4.s, z0.s, z4.s
+; CHECK-NEXT: mul z5.s, z0.s, z5.s
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.s, z3.s, #0x2000
+; CHECK-NEXT: and z6.s, z6.s, #0x1000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: and z1.s, z1.s, #0x4000
+; CHECK-NEXT: mul z3.s, z0.s, z3.s
+; CHECK-NEXT: mul z4.s, z0.s, z6.s
+; CHECK-NEXT: and z5.s, z5.s, #0x8000
+; CHECK-NEXT: mul z1.s, z0.s, z1.s
+; CHECK-NEXT: eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-NEXT: mul z3.s, z0.s, z5.s
+; CHECK-NEXT: mul z0.s, z0.s, #0
+; CHECK-NEXT: eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 4 x i16> %x to <vscale x 4 x i32>
+ %zexty = zext <vscale x 4 x i16> %y to <vscale x 4 x i32>
+ %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %zextx, <vscale x 4 x i32> %zexty)
+ ret <vscale x 4 x i32> %a
+}
+
+define <vscale x 2 x i64> @clmul_nxv2i64_sve2_zext(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) #1 {
+; CHECK-LABEL: clmul_nxv2i64_sve2_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z2.d, z2.d, #0x2
+; CHECK-NEXT: and z3.d, z3.d, #0x1
+; CHECK-NEXT: and z4.d, z4.d, #0x8
+; CHECK-NEXT: and z5.d, z5.d, #0x4
+; CHECK-NEXT: and z6.d, z6.d, #0x20
+; CHECK-NEXT: mul z2.d, z0.d, z2.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x10
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: and z4.d, z4.d, #0x80
+; CHECK-NEXT: and z5.d, z5.d, #0x40
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x200
+; CHECK-NEXT: and z6.d, z6.d, #0x100
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x800
+; CHECK-NEXT: and z5.d, z5.d, #0x400
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x2000
+; CHECK-NEXT: and z6.d, z6.d, #0x1000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x8000
+; CHECK-NEXT: and z5.d, z5.d, #0x4000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x20000
+; CHECK-NEXT: and z6.d, z6.d, #0x10000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x80000
+; CHECK-NEXT: and z5.d, z5.d, #0x40000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x200000
+; CHECK-NEXT: and z6.d, z6.d, #0x100000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x800000
+; CHECK-NEXT: and z5.d, z5.d, #0x400000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x2000000
+; CHECK-NEXT: and z6.d, z6.d, #0x1000000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x8000000
+; CHECK-NEXT: and z5.d, z5.d, #0x4000000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x20000000
+; CHECK-NEXT: and z6.d, z6.d, #0x10000000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: and z1.d, z1.d, #0x40000000
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z4.d, z0.d, z6.d
+; CHECK-NEXT: and z5.d, z5.d, #0x80000000
+; CHECK-NEXT: mul z1.d, z0.d, z1.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-NEXT: mul z3.d, z0.d, z5.d
+; CHECK-NEXT: mul z0.d, z0.d, #0
+; CHECK-NEXT: eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 2 x i32> %x to <vscale x 2 x i64>
+ %zexty = zext <vscale x 2 x i32> %y to <vscale x 2 x i64>
+ %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %zextx, <vscale x 2 x i64> %zexty)
+ ret <vscale x 2 x i64> %a
+}
+
+define <vscale x 2 x i64> @clmul_nxv2i64_sve2_aes_zext(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) #2 {
+; CHECK-LABEL: clmul_nxv2i64_sve2_aes_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z2.d, z2.d, #0x2
+; CHECK-NEXT: and z3.d, z3.d, #0x1
+; CHECK-NEXT: and z4.d, z4.d, #0x8
+; CHECK-NEXT: and z5.d, z5.d, #0x4
+; CHECK-NEXT: and z6.d, z6.d, #0x20
+; CHECK-NEXT: mul z2.d, z0.d, z2.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: eor z2.d, z3.d, z2.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x10
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: and z4.d, z4.d, #0x80
+; CHECK-NEXT: and z5.d, z5.d, #0x40
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z3.d, z6.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x200
+; CHECK-NEXT: and z6.d, z6.d, #0x100
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x800
+; CHECK-NEXT: and z5.d, z5.d, #0x400
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x2000
+; CHECK-NEXT: and z6.d, z6.d, #0x1000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x8000
+; CHECK-NEXT: and z5.d, z5.d, #0x4000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x20000
+; CHECK-NEXT: and z6.d, z6.d, #0x10000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x80000
+; CHECK-NEXT: and z5.d, z5.d, #0x40000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x200000
+; CHECK-NEXT: and z6.d, z6.d, #0x100000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x800000
+; CHECK-NEXT: and z5.d, z5.d, #0x400000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x2000000
+; CHECK-NEXT: and z6.d, z6.d, #0x1000000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z6.d, z0.d, z6.d
+; CHECK-NEXT: and z4.d, z4.d, #0x8000000
+; CHECK-NEXT: and z5.d, z5.d, #0x4000000
+; CHECK-NEXT: mul z4.d, z0.d, z4.d
+; CHECK-NEXT: mul z5.d, z0.d, z5.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z6.d, z3.d
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z6.d, z1.d
+; CHECK-NEXT: and z3.d, z3.d, #0x20000000
+; CHECK-NEXT: and z6.d, z6.d, #0x10000000
+; CHECK-NEXT: eor3 z2.d, z2.d, z5.d, z4.d
+; CHECK-NEXT: mov z5.d, z1.d
+; CHECK-NEXT: and z1.d, z1.d, #0x40000000
+; CHECK-NEXT: mul z3.d, z0.d, z3.d
+; CHECK-NEXT: mul z4.d, z0.d, z6.d
+; CHECK-NEXT: and z5.d, z5.d, #0x80000000
+; CHECK-NEXT: mul z1.d, z0.d, z1.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z4.d, z3.d
+; CHECK-NEXT: mul z3.d, z0.d, z5.d
+; CHECK-NEXT: mul z0.d, z0.d, #0
+; CHECK-NEXT: eor3 z2.d, z2.d, z1.d, z3.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: eor3 z2.d, z2.d, z0.d, z0.d
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+ %zextx = zext <vscale x 2 x i32> %x to <vscale x 2 x i64>
+ %zexty = zext <vscale x 2 x i32> %y to <vscale x 2 x i64>
+ %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %zextx, <vscale x 2 x i64> %zexty)
+ ret <vscale x 2 x i64> %a
+}
+
attributes #0 = { "target-features"="+sve" }
attributes #1 = { "target-features"="+sve2" }
attributes #2 = { "target-features"="+sve2,+sve2-aes" }
diff --git a/llvm/test/CodeGen/AArch64/clmul.ll b/llvm/test/CodeGen/AArch64/clmul.ll
index a7260523ae802..6d19e09ca3370 100644
--- a/llvm/test/CodeGen/AArch64/clmul.ll
+++ b/llvm/test/CodeGen/AArch64/clmul.ll
@@ -465,3 +465,223 @@ define i64 @clmul_i64(i64 %x, i64 %y) {
%a = call i64 @llvm.clmul.i64(i64 %x, i64 %y)
ret i64 %a
}
+
+define i8 @clmul_i8_zext(i4 %x, i4 %y) {
+; CHECK-LABEL: clmul_i8_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w0, #0xf
+; CHECK-NEXT: and w9, w1, #0x2
+; CHECK-NEXT: and w10, w1, #0x1
+; CHECK-NEXT: and w11, w1, #0x4
+; CHECK-NEXT: and w12, w1, #0x8
+; CHECK-NEXT: mul w9, w8, w9
+; CHECK-NEXT: mul w10, w8, w10
+; CHECK-NEXT: mul w11, w8, w11
+; CHECK-NEXT: mul w8, w8, w12
+; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: eor w8, w11, w8
+; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: ret
+ %zextx = zext i4 %x to i8
+ %zexty = zext i4 %y to i8
+ %a = call i8 @llvm.clmul.i8(i8 %zextx, i8 %zexty)
+ ret i8 %a
+}
+
+define i16 @clmul_i16_zext(i8 %x, i8 %y) {
+; CHECK-LABEL: clmul_i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: and w9, w1, #0x2
+; CHECK-NEXT: and w10, w1, #0x1
+; CHECK-NEXT: mul w9, w8, w9
+; CHECK-NEXT: and w11, w1, #0x4
+; CHECK-NEXT: and w12, w1, #0x8
+; CHECK-NEXT: mul w10, w8, w10
+; CHECK-NEXT: and w13, w1, #0x10
+; CHECK-NEXT: and w14, w1, #0x20
+; CHECK-NEXT: mul w11, w8, w11
+; CHECK-NEXT: and w15, w1, #0x40
+; CHECK-NEXT: mul w12, w8, w12
+; CHECK-NEXT: mul w13, w8, w13
+; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: and w10, w1, #0x80
+; CHECK-NEXT: mul w14, w8, w14
+; CHECK-NEXT: mul w15, w8, w15
+; CHECK-NEXT: eor w11, w11, w12
+; CHECK-NEXT: mul w8, w8, w10
+; CHECK-NEXT: eor w9, w9, w11
+; CHECK-NEXT: eor w12, w13, w14
+; CHECK-NEXT: eor w10, w12, w15
+; CHECK-NEXT: eor w9, w9, w10
+; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: ret
+ %zextx = zext i8 %x to i16
+ %zexty = zext i8 %y to i16
+ %a = call i16 @llvm.clmul.i16(i16 %zextx, i16 %zexty)
+ ret i16 %a
+}
+
+define i32 @clmul_i32_zext(i16 %x, i16 %y) {
+; CHECK-LABEL: clmul_i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: and w9, w1, #0x2
+; CHECK-NEXT: and w10, w1, #0x1
+; CHECK-NEXT: mul w9, w8, w9
+; CHECK-NEXT: and w11, w1, #0x4
+; CHECK-NEXT: and w12, w1, #0x8
+; CHECK-NEXT: mul w10, w8, w10
+; CHECK-NEXT: and w13, w1, #0x10
+; CHECK-NEXT: and w14, w1, #0x20
+; CHECK-NEXT: mul w11, w8, w11
+; CHECK-NEXT: and w16, w1, #0x80
+; CHECK-NEXT: and w17, w1, #0x100
+; CHECK-NEXT: mul w12, w8, w12
+; CHECK-NEXT: and w2, w1, #0x800
+; CHECK-NEXT: and w15, w1, #0x40
+; CHECK-NEXT: mul w13, w8, w13
+; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: and w10, w1, #0x1000
+; CHECK-NEXT: mul w14, w8, w14
+; CHECK-NEXT: and w18, w1, #0x200
+; CHECK-NEXT: and w0, w1, #0x400
+; CHECK-NEXT: mul w16, w8, w16
+; CHECK-NEXT: eor w11, w11, w12
+; CHECK-NEXT: and w12, w1, #0x2000
+; CHECK-NEXT: mul w17, w8, w17
+; CHECK-NEXT: eor w9, w9, w11
+; CHECK-NEXT: and w11, w1, #0x4000
+; CHECK-NEXT: mul w2, w8, w2
+; CHECK-NEXT: eor w13, w13, w14
+; CHECK-NEXT: and w14, w1, #0x8000
+; CHECK-NEXT: mul w10, w8, w10
+; CHECK-NEXT: mul w15, w8, w15
+; CHECK-NEXT: eor w16, w16, w17
+; CHECK-NEXT: mul w18, w8, w18
+; CHECK-NEXT: mul w12, w8, w12
+; CHECK-NEXT: eor w10, w2, w10
+; CHECK-NEXT: mul w0, w8, w0
+; CHECK-NEXT: eor w13, w13, w15
+; CHECK-NEXT: mul w11, w8, w11
+; CHECK-NEXT: eor w9, w9, w13
+; CHECK-NEXT: mul w8, w8, w14
+; CHECK-NEXT: eor w14, w16, w18
+; CHECK-NEXT: eor w10, w10, w12
+; CHECK-NEXT: eor w12, w14, w0
+; CHECK-NEXT: eor w10, w10, w11
+; CHECK-NEXT: eor w9, w9, w12
+; CHECK-NEXT: eor w8, w10, w8
+; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: ret
+ %zextx = zext i16 %x to i32
+ %zexty = zext i16 %y to i32
+ %a = call i32 @llvm.clmul.i32(i32 %zextx, i32 %zexty)
+ ret i32 %a
+}
+
+define i64 @clmul_i64_zext(i32 %x, i32 %y) {
+; CHECK-LABEL: clmul_i64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: and x9, x8, #0x2
+; CHECK-NEXT: and x10, x8, #0x1
+; CHECK-NEXT: and x11, x8, #0x4
+; CHECK-NEXT: umull x9, w9, w0
+; CHECK-NEXT: and x12, x8, #0x8
+; CHECK-NEXT: and x13, x8, #0x10
+; CHECK-NEXT: umull x10, w10, w0
+; CHECK-NEXT: and x14, x8, #0x20
+; CHECK-NEXT: and x15, x8, #0x40
+; CHECK-NEXT: umull x11, w11, w0
+; CHECK-NEXT: and x2, x8, #0x800
+; CHECK-NEXT: and x16, x8, #0x80
+; CHECK-NEXT: umull x12, w12, w0
+; CHECK-NEXT: and x17, x8, #0x100
+; CHECK-NEXT: and x18, x8, #0x200
+; CHECK-NEXT: umull x13, w13, w0
+; CHECK-NEXT: eor x9, x10, x9
+; CHECK-NEXT: and x10, x8, #0x1000
+; CHECK-NEXT: umull x14, w14, w0
+; CHECK-NEXT: and x1, x8, #0x400
+; CHECK-NEXT: umull x15, w15, w0
+; CHECK-NEXT: eor x11, x11, x12
+; CHECK-NEXT: and x12, x8, #0x2000
+; CHECK-NEXT: umull x2, w2, w0
+; CHECK-NEXT: eor x9, x9, x11
+; CHECK-NEXT: and x11, x8, #0x4000
+; CHECK-NEXT: umull x10, w10, w0
+; CHECK-NEXT: eor x13, x13, x14
+; CHECK-NEXT: and x14, x8, #0x8000
+; CHECK-NEXT: umull x16, w16, w0
+; CHECK-NEXT: eor x13, x13, x15
+; CHECK-NEXT: and x15, x8, #0x10000
+; CHECK-NEXT: umull x17, w17, w0
+; CHECK-NEXT: eor x9, x9, x13
+; CHECK-NEXT: and x13, x8, #0x20000
+; CHECK-NEXT: umull x12, w12, w0
+; CHECK-NEXT: eor x10, x2, x10
+; CHECK-NEXT: and x2, x8, #0x400000
+; CHECK-NEXT: umull x18, w18, w0
+; CHECK-NEXT: umull x11, w11, w0
+; CHECK-NEXT: eor x16, x16, x17
+; CHECK-NEXT: and x17, x8, #0x40000
+; CHECK-NEXT: umull x14, w14, w0
+; CHECK-NEXT: eor x10, x10, x12
+; CHECK-NEXT: and x12, x8, #0x800000
+; CHECK-NEXT: umull x1, w1, w0
+; CHECK-NEXT: eor x16, x16, x18
+; CHECK-NEXT: and x18, x8, #0x80000
+; CHECK-NEXT: umull x15, w15, w0
+; CHECK-NEXT: eor x10, x10, x11
+; CHECK-NEXT: and x11, x8, #0x1000000
+; CHECK-NEXT: umull x13, w13, w0
+; CHECK-NEXT: eor x10, x10, x14
+; CHECK-NEXT: and x14, x8, #0x2000000
+; CHECK-NEXT: umull x17, w17, w0
+; CHECK-NEXT: eor x16, x16, x1
+; CHECK-NEXT: and x1, x8, #0x100000
+; CHECK-NEXT: umull x2, w2, w0
+; CHECK-NEXT: eor x9, x9, x16
+; CHECK-NEXT: and x16, x8, #0x200000
+; CHECK-NEXT: umull x12, w12, w0
+; CHECK-NEXT: eor x13, x15, x13
+; CHECK-NEXT: and x15, x8, #0x4000000
+; CHECK-NEXT: umull x18, w18, w0
+; CHECK-NEXT: eor x13, x13, x17
+; CHECK-NEXT: and x17, x8, #0x8000000
+; CHECK-NEXT: umull x11, w11, w0
+; CHECK-NEXT: eor x9, x9, x10
+; CHECK-NEXT: umull x14, w14, w0
+; CHECK-NEXT: eor x12, x2, x12
+; CHECK-NEXT: and x2, x8, #0x20000000
+; CHECK-NEXT: umull x1, w1, w0
+; CHECK-NEXT: eor x13, x13, x18
+; CHECK-NEXT: and x18, x8, #0x10000000
+; CHECK-NEXT: umull x15, w15, w0
+; CHECK-NEXT: eor x11, x12, x11
+; CHECK-NEXT: and x12, x8, #0x40000000
+; CHECK-NEXT: umull x16, w16, w0
+; CHECK-NEXT: eor x11, x11, x14
+; CHECK-NEXT: and x8, x8, #0x80000000
+; CHECK-NEXT: umull x17, w17, w0
+; CHECK-NEXT: eor x13, x13, x1
+; CHECK-NEXT: umull x18, w18, w0
+; CHECK-NEXT: eor x11, x11, x15
+; CHECK-NEXT: umull x2, w2, w0
+; CHECK-NEXT: eor x10, x13, x16
+; CHECK-NEXT: umull x12, w12, w0
+; CHECK-NEXT: eor x11, x11, x17
+; CHECK-NEXT: eor x9, x9, x10
+; CHECK-NEXT: umull x8, w8, w0
+; CHECK-NEXT: eor x10, x11, x18
+; CHECK-NEXT: eor x9, x9, x10
+; CHECK-NEXT: eor x11, x2, x12
+; CHECK-NEXT: eor x8, x11, x8
+; CHECK-NEXT: eor x0, x9, x8
+; CHECK-NEXT: ret
+ %zextx = zext i32 %x to i64
+ %zexty = zext i32 %y to i64
+ %a = call i64 @llvm.clmul.i64(i64 %zextx, i64 %zexty)
+ ret i64 %a
+}
More information about the llvm-commits
mailing list