[llvm] [AArch64] Add clmul/h/r v16i8/v8i16/v4i32/v2i64 test coverage (PR #182305)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 19 07:50:31 PST 2026
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/182305
Some of the v16i8/v2i64 tests are currently disabled due to #182270 and #182039
From d08846f4e41a32075cf5a600f1e01a4859a469ed Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 19 Feb 2026 15:47:20 +0000
Subject: [PATCH] [AArch64] Add clmul/h/r v16i8/v8i16/v4i32/v2i64 test coverage
Some of the v16i8/v2i64 tests are currently disabled due to #182270 and #182039
---
llvm/test/CodeGen/AArch64/clmul-vector.ll | 807 ++++++++++++++++++++++
1 file changed, 807 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/clmul-vector.ll
diff --git a/llvm/test/CodeGen/AArch64/clmul-vector.ll b/llvm/test/CodeGen/AArch64/clmul-vector.ll
new file mode 100644
index 0000000000000..03eedd71d670f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/clmul-vector.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: clmul_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pmul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %res = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) ; 8-bit lanes: the expected output above is a single pmul
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmul_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.8h, #2
+; CHECK-NEXT: movi v3.8h, #1
+; CHECK-NEXT: movi v4.8h, #4
+; CHECK-NEXT: movi v5.8h, #8
+; CHECK-NEXT: movi v6.8h, #16
+; CHECK-NEXT: movi v7.8h, #32
+; CHECK-NEXT: movi v16.8h, #128
+; CHECK-NEXT: movi v17.8h, #1, lsl #8
+; CHECK-NEXT: movi v18.8h, #8, lsl #8
+; CHECK-NEXT: movi v19.8h, #16, lsl #8
+; CHECK-NEXT: movi v20.8h, #64
+; CHECK-NEXT: movi v21.8h, #2, lsl #8
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: movi v22.8h, #32, lsl #8
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: mul v2.8h, v0.8h, v2.8h
+; CHECK-NEXT: mul v3.8h, v0.8h, v3.8h
+; CHECK-NEXT: mul v4.8h, v0.8h, v4.8h
+; CHECK-NEXT: mul v5.8h, v0.8h, v5.8h
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: movi v23.8h, #4, lsl #8
+; CHECK-NEXT: movi v24.8h, #64, lsl #8
+; CHECK-NEXT: mul v6.8h, v0.8h, v6.8h
+; CHECK-NEXT: mul v7.8h, v0.8h, v7.8h
+; CHECK-NEXT: mul v16.8h, v0.8h, v16.8h
+; CHECK-NEXT: mul v17.8h, v0.8h, v17.8h
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: mul v18.8h, v0.8h, v18.8h
+; CHECK-NEXT: mul v19.8h, v0.8h, v19.8h
+; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: mul v4.8h, v0.8h, v20.8h
+; CHECK-NEXT: movi v20.8h, #128, lsl #8
+; CHECK-NEXT: mul v5.8h, v0.8h, v21.8h
+; CHECK-NEXT: and v21.16b, v1.16b, v23.16b
+; CHECK-NEXT: and v23.16b, v1.16b, v24.16b
+; CHECK-NEXT: mul v22.8h, v0.8h, v22.8h
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v20.16b
+; CHECK-NEXT: mul v3.8h, v0.8h, v21.8h
+; CHECK-NEXT: mul v17.8h, v0.8h, v23.8h
+; CHECK-NEXT: eor v4.16b, v6.16b, v4.16b
+; CHECK-NEXT: eor v5.16b, v7.16b, v5.16b
+; CHECK-NEXT: eor v6.16b, v16.16b, v22.16b
+; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v2.16b, v5.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v6.16b, v17.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) ; 16-bit lanes: expected output ANDs v1 with sixteen single-bit masks, multiplies each by v0 and XORs the products
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmul_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #2
+; CHECK-NEXT: movi v3.4s, #1
+; CHECK-NEXT: movi v4.4s, #4
+; CHECK-NEXT: movi v5.4s, #8
+; CHECK-NEXT: movi v6.4s, #16
+; CHECK-NEXT: movi v7.4s, #32
+; CHECK-NEXT: movi v16.4s, #64
+; CHECK-NEXT: movi v17.4s, #128
+; CHECK-NEXT: movi v18.4s, #1, lsl #8
+; CHECK-NEXT: movi v19.4s, #2, lsl #8
+; CHECK-NEXT: movi v20.4s, #8, lsl #8
+; CHECK-NEXT: movi v21.4s, #128, lsl #16
+; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: mul v5.4s, v0.4s, v5.4s
+; CHECK-NEXT: mul v6.4s, v0.4s, v6.4s
+; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT: movi v22.4s, #8, lsl #16
+; CHECK-NEXT: movi v23.4s, #2, lsl #24
+; CHECK-NEXT: movi v25.4s, #4, lsl #24
+; CHECK-NEXT: movi v24.4s, #32, lsl #16
+; CHECK-NEXT: movi v26.4s, #8, lsl #24
+; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: movi v4.4s, #16, lsl #8
+; CHECK-NEXT: mul v5.4s, v0.4s, v16.4s
+; CHECK-NEXT: mul v16.4s, v0.4s, v17.4s
+; CHECK-NEXT: mul v17.4s, v0.4s, v18.4s
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v19.16b
+; CHECK-NEXT: movi v19.4s, #32, lsl #8
+; CHECK-NEXT: and v18.16b, v1.16b, v20.16b
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v20.4s, #64, lsl #8
+; CHECK-NEXT: mul v21.4s, v0.4s, v21.4s
+; CHECK-NEXT: and v3.16b, v1.16b, v4.16b
+; CHECK-NEXT: eor v5.16b, v6.16b, v5.16b
+; CHECK-NEXT: movi v4.4s, #1, lsl #16
+; CHECK-NEXT: eor v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: movi v16.4s, #2, lsl #16
+; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT: mul v18.4s, v0.4s, v18.4s
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: movi v17.4s, #4, lsl #8
+; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v1.16b, v16.16b
+; CHECK-NEXT: movi v16.4s, #64, lsl #16
+; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT: mul v7.4s, v0.4s, v19.4s
+; CHECK-NEXT: movi v19.4s, #4, lsl #16
+; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v3.16b, v18.16b, v3.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v20.16b
+; CHECK-NEXT: movi v20.4s, #1, lsl #24
+; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: mul v5.4s, v0.4s, v5.4s
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: mul v17.4s, v0.4s, v17.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v19.16b
+; CHECK-NEXT: mul v18.4s, v0.4s, v18.4s
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: movi v19.4s, #128, lsl #8
+; CHECK-NEXT: mul v16.4s, v0.4s, v16.4s
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: mul v5.4s, v0.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v1.16b, v22.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #16
+; CHECK-NEXT: mul v20.4s, v0.4s, v20.4s
+; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v18.16b
+; CHECK-NEXT: and v17.16b, v1.16b, v19.16b
+; CHECK-NEXT: mul v18.4s, v0.4s, v23.4s
+; CHECK-NEXT: and v19.16b, v1.16b, v25.16b
+; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT: and v21.16b, v1.16b, v24.16b
+; CHECK-NEXT: movi v23.4s, #32, lsl #24
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: mul v5.4s, v0.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v1.16b, v22.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #24
+; CHECK-NEXT: movi v24.4s, #64, lsl #24
+; CHECK-NEXT: mul v17.4s, v0.4s, v17.4s
+; CHECK-NEXT: eor v16.16b, v16.16b, v20.16b
+; CHECK-NEXT: and v20.16b, v1.16b, v26.16b
+; CHECK-NEXT: mul v19.4s, v0.4s, v19.4s
+; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT: mul v6.4s, v0.4s, v21.4s
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v21.16b, v1.16b, v23.16b
+; CHECK-NEXT: eor v5.16b, v16.16b, v18.16b
+; CHECK-NEXT: movi v16.4s, #128, lsl #24
+; CHECK-NEXT: mul v18.4s, v0.4s, v20.4s
+; CHECK-NEXT: and v20.16b, v1.16b, v22.16b
+; CHECK-NEXT: and v22.16b, v1.16b, v24.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v17.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v19.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
+; CHECK-NEXT: mul v7.4s, v0.4s, v20.4s
+; CHECK-NEXT: mul v16.4s, v0.4s, v21.4s
+; CHECK-NEXT: mul v17.4s, v0.4s, v22.4s
+; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b
+; CHECK-NEXT: eor v4.16b, v5.16b, v18.16b
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %res = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) ; 32-bit lanes: expected output ANDs v1 with single-bit masks, multiplies each by v0 and XORs the products
+ ret <4 x i32> %res
+}
+
+; TODO: test disabled for now - blocked on #182270 and/or #182039.
+;define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; %res = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+; ret <2 x i64> %res
+;}
+
+; TODO: test disabled for now - blocked on #182270 and/or #182039.
+;define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; %a.ext = zext <16 x i8> %a to <16 x i16>
+; %b.ext = zext <16 x i8> %b to <16 x i16>
+; %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+; %res.ext = lshr <16 x i16> %clmul, splat (i16 7)
+; %res = trunc <16 x i16> %res.ext to <16 x i8>
+; ret <16 x i8> %res
+;}
+
+define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmulr_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.16b, #15
+; CHECK-NEXT: rev16 v1.16b, v1.16b
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: movi v7.8h, #1
+; CHECK-NEXT: movi v17.8h, #4
+; CHECK-NEXT: movi v18.8h, #8
+; CHECK-NEXT: movi v19.8h, #16
+; CHECK-NEXT: movi v20.8h, #32
+; CHECK-NEXT: movi v22.8h, #128
+; CHECK-NEXT: movi v23.8h, #1, lsl #8
+; CHECK-NEXT: movi v25.8h, #8, lsl #8
+; CHECK-NEXT: movi v26.8h, #16, lsl #8
+; CHECK-NEXT: ushr v3.8h, v1.8h, #4
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: ushr v4.8h, v0.8h, #4
+; CHECK-NEXT: and v5.16b, v0.16b, v2.16b
+; CHECK-NEXT: movi v0.16b, #51
+; CHECK-NEXT: movi v21.8h, #64
+; CHECK-NEXT: movi v27.8h, #32, lsl #8
+; CHECK-NEXT: movi v24.8h, #4, lsl #8
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: shl v1.8h, v1.8h, #4
+; CHECK-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT: shl v5.8h, v5.8h, #4
+; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: orr v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: ushr v4.8h, v1.8h, #2
+; CHECK-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT: ushr v5.8h, v3.8h, #2
+; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v0.16b
+; CHECK-NEXT: shl v6.8h, v1.8h, #2
+; CHECK-NEXT: movi v1.16b, #85
+; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
+; CHECK-NEXT: shl v3.8h, v3.8h, #2
+; CHECK-NEXT: orr v4.16b, v4.16b, v6.16b
+; CHECK-NEXT: movi v6.8h, #2
+; CHECK-NEXT: orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT: ushr v5.8h, v4.8h, #1
+; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT: ushr v16.8h, v3.8h, #1
+; CHECK-NEXT: and v3.16b, v3.16b, v1.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v4.8h, v4.8h, v4.8h
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: add v3.8h, v3.8h, v3.8h
+; CHECK-NEXT: orr v4.16b, v5.16b, v4.16b
+; CHECK-NEXT: movi v5.8h, #2, lsl #8
+; CHECK-NEXT: orr v3.16b, v16.16b, v3.16b
+; CHECK-NEXT: movi v16.8h, #64, lsl #8
+; CHECK-NEXT: and v6.16b, v4.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v4.16b, v7.16b
+; CHECK-NEXT: and v17.16b, v4.16b, v17.16b
+; CHECK-NEXT: and v18.16b, v4.16b, v18.16b
+; CHECK-NEXT: and v19.16b, v4.16b, v19.16b
+; CHECK-NEXT: and v20.16b, v4.16b, v20.16b
+; CHECK-NEXT: and v22.16b, v4.16b, v22.16b
+; CHECK-NEXT: and v23.16b, v4.16b, v23.16b
+; CHECK-NEXT: and v25.16b, v4.16b, v25.16b
+; CHECK-NEXT: and v26.16b, v4.16b, v26.16b
+; CHECK-NEXT: mul v6.8h, v3.8h, v6.8h
+; CHECK-NEXT: mul v7.8h, v3.8h, v7.8h
+; CHECK-NEXT: mul v17.8h, v3.8h, v17.8h
+; CHECK-NEXT: mul v18.8h, v3.8h, v18.8h
+; CHECK-NEXT: and v21.16b, v4.16b, v21.16b
+; CHECK-NEXT: and v5.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v27.16b, v4.16b, v27.16b
+; CHECK-NEXT: mul v19.8h, v3.8h, v19.8h
+; CHECK-NEXT: mul v20.8h, v3.8h, v20.8h
+; CHECK-NEXT: mul v22.8h, v3.8h, v22.8h
+; CHECK-NEXT: mul v23.8h, v3.8h, v23.8h
+; CHECK-NEXT: mul v25.8h, v3.8h, v25.8h
+; CHECK-NEXT: mul v26.8h, v3.8h, v26.8h
+; CHECK-NEXT: eor v6.16b, v7.16b, v6.16b
+; CHECK-NEXT: movi v7.8h, #128, lsl #8
+; CHECK-NEXT: eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT: and v18.16b, v4.16b, v24.16b
+; CHECK-NEXT: mul v21.8h, v3.8h, v21.8h
+; CHECK-NEXT: mul v5.8h, v3.8h, v5.8h
+; CHECK-NEXT: mul v24.8h, v3.8h, v27.8h
+; CHECK-NEXT: and v16.16b, v4.16b, v16.16b
+; CHECK-NEXT: eor v19.16b, v19.16b, v20.16b
+; CHECK-NEXT: eor v20.16b, v22.16b, v23.16b
+; CHECK-NEXT: eor v22.16b, v25.16b, v26.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v7.16b
+; CHECK-NEXT: mul v7.8h, v3.8h, v18.8h
+; CHECK-NEXT: mul v16.8h, v3.8h, v16.8h
+; CHECK-NEXT: eor v17.16b, v19.16b, v21.16b
+; CHECK-NEXT: eor v5.16b, v20.16b, v5.16b
+; CHECK-NEXT: eor v18.16b, v22.16b, v24.16b
+; CHECK-NEXT: mul v3.8h, v3.8h, v4.8h
+; CHECK-NEXT: eor v4.16b, v6.16b, v17.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT: eor v6.16b, v18.16b, v16.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: eor v3.16b, v6.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: rev16 v3.16b, v3.16b
+; CHECK-NEXT: ushr v4.8h, v3.8h, #4
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: shl v3.8h, v3.8h, #4
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: ushr v3.8h, v2.8h, #2
+; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: shl v2.8h, v2.8h, #2
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ushr v2.8h, v0.8h, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %a.ext = zext <8 x i16> %a to <8 x i32> ; widen both operands to 32-bit lanes before multiplying
+ %b.ext = zext <8 x i16> %b to <8 x i32>
+ %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext) ; expected output stays in 16-bit lanes: rev16 plus 4/2/1-bit shift-and-mask swaps on the inputs and on the result
+ %res.ext = lshr <8 x i32> %clmul, splat (i32 15) ; shift by 15, so the truncate below keeps bits 15..30 of the widened product
+ %res = trunc <8 x i32> %res.ext to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmulr_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v1.16b, v1.16b
+; CHECK-NEXT: rev32 v2.16b, v0.16b
+; CHECK-NEXT: movi v3.4s, #2
+; CHECK-NEXT: movi v4.4s, #1
+; CHECK-NEXT: movi v5.4s, #4
+; CHECK-NEXT: movi v6.4s, #8
+; CHECK-NEXT: movi v7.4s, #16
+; CHECK-NEXT: movi v16.4s, #32
+; CHECK-NEXT: movi v17.4s, #64
+; CHECK-NEXT: movi v18.4s, #1, lsl #8
+; CHECK-NEXT: movi v19.4s, #2, lsl #8
+; CHECK-NEXT: movi v20.4s, #8, lsl #8
+; CHECK-NEXT: rbit v0.16b, v1.16b
+; CHECK-NEXT: rbit v1.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #128
+; CHECK-NEXT: movi v21.4s, #16, lsl #8
+; CHECK-NEXT: movi v22.4s, #8, lsl #16
+; CHECK-NEXT: movi v23.4s, #2, lsl #24
+; CHECK-NEXT: movi v25.4s, #4, lsl #24
+; CHECK-NEXT: movi v24.4s, #32, lsl #16
+; CHECK-NEXT: movi v26.4s, #8, lsl #24
+; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v5.4s, v1.4s, v5.4s
+; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: mul v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: and v23.16b, v0.16b, v23.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: eor v6.16b, v7.16b, v16.16b
+; CHECK-NEXT: mul v5.4s, v1.4s, v18.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v19.16b
+; CHECK-NEXT: movi v18.4s, #32, lsl #8
+; CHECK-NEXT: and v16.16b, v0.16b, v20.16b
+; CHECK-NEXT: movi v19.4s, #1, lsl #16
+; CHECK-NEXT: movi v20.4s, #4, lsl #8
+; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v4.16b, v0.16b, v21.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT: movi v17.4s, #2, lsl #16
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: eor v5.16b, v2.16b, v5.16b
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT: movi v21.4s, #64, lsl #8
+; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
+; CHECK-NEXT: and v3.16b, v0.16b, v19.16b
+; CHECK-NEXT: movi v19.4s, #128, lsl #16
+; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT: and v6.16b, v0.16b, v17.16b
+; CHECK-NEXT: movi v17.4s, #64, lsl #16
+; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT: mul v7.4s, v1.4s, v18.4s
+; CHECK-NEXT: movi v18.4s, #4, lsl #16
+; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT: and v16.16b, v0.16b, v21.16b
+; CHECK-NEXT: movi v21.4s, #1, lsl #24
+; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT: mul v20.4s, v1.4s, v20.4s
+; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT: movi v18.4s, #128, lsl #8
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v20.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #16
+; CHECK-NEXT: mul v21.4s, v1.4s, v21.4s
+; CHECK-NEXT: eor v4.16b, v4.16b, v16.16b
+; CHECK-NEXT: and v20.16b, v0.16b, v24.16b
+; CHECK-NEXT: movi v24.4s, #64, lsl #24
+; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT: eor v16.16b, v17.16b, v19.16b
+; CHECK-NEXT: and v17.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v18.4s, v1.4s, v23.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT: and v19.16b, v0.16b, v25.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #24
+; CHECK-NEXT: movi v23.4s, #32, lsl #24
+; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT: and v21.16b, v0.16b, v26.16b
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: mul v5.4s, v1.4s, v20.4s
+; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: eor v6.16b, v16.16b, v18.16b
+; CHECK-NEXT: movi v16.4s, #128, lsl #24
+; CHECK-NEXT: mul v18.4s, v1.4s, v21.4s
+; CHECK-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT: and v22.16b, v0.16b, v24.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v19.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: mul v7.4s, v1.4s, v20.4s
+; CHECK-NEXT: mul v16.4s, v1.4s, v21.4s
+; CHECK-NEXT: mul v17.4s, v1.4s, v22.4s
+; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT: eor v4.16b, v6.16b, v18.16b
+; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
+ %a.ext = zext <4 x i32> %a to <4 x i64> ; widen both operands to 64-bit lanes before multiplying
+ %b.ext = zext <4 x i32> %b to <4 x i64>
+ %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext) ; expected output stays in 32-bit lanes: rev32+rbit on the inputs and again on the result
+ %res.ext = lshr <4 x i64> %clmul, splat (i64 31) ; shift by 31, so the truncate below keeps bits 31..62 of the widened product
+ %res = trunc <4 x i64> %res.ext to <4 x i32>
+ ret <4 x i32> %res
+}
+
+; TODO: test disabled for now - blocked on #182270 and/or #182039.
+;define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; %a.ext = zext <2 x i64> %a to <2 x i128>
+; %b.ext = zext <2 x i64> %b to <2 x i128>
+; %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+; %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+; %res = trunc <2 x i128> %res.ext to <2 x i64>
+; ret <2 x i64> %res
+;}
+
+; TODO: test disabled for now - blocked on #182270 and/or #182039.
+;define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; %a.ext = zext <16 x i8> %a to <16 x i16>
+; %b.ext = zext <16 x i8> %b to <16 x i16>
+; %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+; %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
+; %res = trunc <16 x i16> %res.ext to <16 x i8>
+; ret <16 x i8> %res
+;}
+
+define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmulh_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.16b, #15
+; CHECK-NEXT: rev16 v1.16b, v1.16b
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: movi v7.8h, #1
+; CHECK-NEXT: movi v17.8h, #4
+; CHECK-NEXT: movi v18.8h, #8
+; CHECK-NEXT: movi v19.8h, #16
+; CHECK-NEXT: movi v20.8h, #32
+; CHECK-NEXT: movi v22.8h, #128
+; CHECK-NEXT: movi v23.8h, #1, lsl #8
+; CHECK-NEXT: movi v25.8h, #8, lsl #8
+; CHECK-NEXT: movi v26.8h, #16, lsl #8
+; CHECK-NEXT: ushr v3.8h, v1.8h, #4
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: ushr v4.8h, v0.8h, #4
+; CHECK-NEXT: and v5.16b, v0.16b, v2.16b
+; CHECK-NEXT: movi v0.16b, #51
+; CHECK-NEXT: movi v21.8h, #64
+; CHECK-NEXT: movi v27.8h, #32, lsl #8
+; CHECK-NEXT: movi v24.8h, #4, lsl #8
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: shl v1.8h, v1.8h, #4
+; CHECK-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT: shl v5.8h, v5.8h, #4
+; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: orr v3.16b, v4.16b, v5.16b
+; CHECK-NEXT: ushr v4.8h, v1.8h, #2
+; CHECK-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT: ushr v5.8h, v3.8h, #2
+; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v0.16b
+; CHECK-NEXT: shl v6.8h, v1.8h, #2
+; CHECK-NEXT: movi v1.16b, #85
+; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
+; CHECK-NEXT: shl v3.8h, v3.8h, #2
+; CHECK-NEXT: orr v4.16b, v4.16b, v6.16b
+; CHECK-NEXT: movi v6.8h, #2
+; CHECK-NEXT: orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT: ushr v5.8h, v4.8h, #1
+; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT: ushr v16.8h, v3.8h, #1
+; CHECK-NEXT: and v3.16b, v3.16b, v1.16b
+; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT: add v4.8h, v4.8h, v4.8h
+; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT: add v3.8h, v3.8h, v3.8h
+; CHECK-NEXT: orr v4.16b, v5.16b, v4.16b
+; CHECK-NEXT: movi v5.8h, #2, lsl #8
+; CHECK-NEXT: orr v3.16b, v16.16b, v3.16b
+; CHECK-NEXT: movi v16.8h, #64, lsl #8
+; CHECK-NEXT: and v6.16b, v4.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v4.16b, v7.16b
+; CHECK-NEXT: and v17.16b, v4.16b, v17.16b
+; CHECK-NEXT: and v18.16b, v4.16b, v18.16b
+; CHECK-NEXT: and v19.16b, v4.16b, v19.16b
+; CHECK-NEXT: and v20.16b, v4.16b, v20.16b
+; CHECK-NEXT: and v22.16b, v4.16b, v22.16b
+; CHECK-NEXT: and v23.16b, v4.16b, v23.16b
+; CHECK-NEXT: and v25.16b, v4.16b, v25.16b
+; CHECK-NEXT: and v26.16b, v4.16b, v26.16b
+; CHECK-NEXT: mul v6.8h, v3.8h, v6.8h
+; CHECK-NEXT: mul v7.8h, v3.8h, v7.8h
+; CHECK-NEXT: mul v17.8h, v3.8h, v17.8h
+; CHECK-NEXT: mul v18.8h, v3.8h, v18.8h
+; CHECK-NEXT: and v21.16b, v4.16b, v21.16b
+; CHECK-NEXT: and v5.16b, v4.16b, v5.16b
+; CHECK-NEXT: and v27.16b, v4.16b, v27.16b
+; CHECK-NEXT: mul v19.8h, v3.8h, v19.8h
+; CHECK-NEXT: mul v20.8h, v3.8h, v20.8h
+; CHECK-NEXT: mul v22.8h, v3.8h, v22.8h
+; CHECK-NEXT: mul v23.8h, v3.8h, v23.8h
+; CHECK-NEXT: mul v25.8h, v3.8h, v25.8h
+; CHECK-NEXT: mul v26.8h, v3.8h, v26.8h
+; CHECK-NEXT: eor v6.16b, v7.16b, v6.16b
+; CHECK-NEXT: movi v7.8h, #128, lsl #8
+; CHECK-NEXT: eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT: and v18.16b, v4.16b, v24.16b
+; CHECK-NEXT: mul v21.8h, v3.8h, v21.8h
+; CHECK-NEXT: mul v5.8h, v3.8h, v5.8h
+; CHECK-NEXT: mul v24.8h, v3.8h, v27.8h
+; CHECK-NEXT: and v16.16b, v4.16b, v16.16b
+; CHECK-NEXT: eor v19.16b, v19.16b, v20.16b
+; CHECK-NEXT: eor v20.16b, v22.16b, v23.16b
+; CHECK-NEXT: eor v22.16b, v25.16b, v26.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT: and v4.16b, v4.16b, v7.16b
+; CHECK-NEXT: mul v7.8h, v3.8h, v18.8h
+; CHECK-NEXT: mul v16.8h, v3.8h, v16.8h
+; CHECK-NEXT: eor v17.16b, v19.16b, v21.16b
+; CHECK-NEXT: eor v5.16b, v20.16b, v5.16b
+; CHECK-NEXT: eor v18.16b, v22.16b, v24.16b
+; CHECK-NEXT: mul v3.8h, v3.8h, v4.8h
+; CHECK-NEXT: eor v4.16b, v6.16b, v17.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT: eor v6.16b, v18.16b, v16.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: eor v3.16b, v6.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: rev16 v3.16b, v3.16b
+; CHECK-NEXT: ushr v4.8h, v3.8h, #4
+; CHECK-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT: shl v3.8h, v3.8h, #4
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: ushr v3.8h, v2.8h, #2
+; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: shl v2.8h, v2.8h, #2
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ushr v2.8h, v0.8h, #1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ushr v0.8h, v0.8h, #1
+; CHECK-NEXT: ret
+ %a.ext = zext <8 x i16> %a to <8 x i32> ; widen both operands to 32-bit lanes before multiplying
+ %b.ext = zext <8 x i16> %b to <8 x i32>
+ %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext) ; expected output stays in 16-bit lanes and ends with an extra ushr #1
+ %res.ext = lshr <8 x i32> %clmul, splat (i32 16) ; shift by 16, so the truncate below keeps the upper half of the widened product
+ %res = trunc <8 x i32> %res.ext to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmulh_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v1.16b, v1.16b
+; CHECK-NEXT: rev32 v2.16b, v0.16b
+; CHECK-NEXT: movi v3.4s, #2
+; CHECK-NEXT: movi v4.4s, #1
+; CHECK-NEXT: movi v5.4s, #4
+; CHECK-NEXT: movi v6.4s, #8
+; CHECK-NEXT: movi v7.4s, #16
+; CHECK-NEXT: movi v16.4s, #32
+; CHECK-NEXT: movi v17.4s, #64
+; CHECK-NEXT: movi v18.4s, #1, lsl #8
+; CHECK-NEXT: movi v19.4s, #2, lsl #8
+; CHECK-NEXT: movi v20.4s, #8, lsl #8
+; CHECK-NEXT: rbit v0.16b, v1.16b
+; CHECK-NEXT: rbit v1.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #128
+; CHECK-NEXT: movi v21.4s, #16, lsl #8
+; CHECK-NEXT: movi v22.4s, #8, lsl #16
+; CHECK-NEXT: movi v23.4s, #2, lsl #24
+; CHECK-NEXT: movi v25.4s, #4, lsl #24
+; CHECK-NEXT: movi v24.4s, #32, lsl #16
+; CHECK-NEXT: movi v26.4s, #8, lsl #24
+; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: mul v5.4s, v1.4s, v5.4s
+; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: mul v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: and v23.16b, v0.16b, v23.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT: eor v6.16b, v7.16b, v16.16b
+; CHECK-NEXT: mul v5.4s, v1.4s, v18.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v19.16b
+; CHECK-NEXT: movi v18.4s, #32, lsl #8
+; CHECK-NEXT: and v16.16b, v0.16b, v20.16b
+; CHECK-NEXT: movi v19.4s, #1, lsl #16
+; CHECK-NEXT: movi v20.4s, #4, lsl #8
+; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v4.16b, v0.16b, v21.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT: movi v17.4s, #2, lsl #16
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: eor v5.16b, v2.16b, v5.16b
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT: movi v21.4s, #64, lsl #8
+; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
+; CHECK-NEXT: and v3.16b, v0.16b, v19.16b
+; CHECK-NEXT: movi v19.4s, #128, lsl #16
+; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT: and v6.16b, v0.16b, v17.16b
+; CHECK-NEXT: movi v17.4s, #64, lsl #16
+; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT: mul v7.4s, v1.4s, v18.4s
+; CHECK-NEXT: movi v18.4s, #4, lsl #16
+; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT: and v16.16b, v0.16b, v21.16b
+; CHECK-NEXT: movi v21.4s, #1, lsl #24
+; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT: mul v20.4s, v1.4s, v20.4s
+; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT: and v7.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT: movi v18.4s, #128, lsl #8
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: eor v5.16b, v5.16b, v20.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #16
+; CHECK-NEXT: mul v21.4s, v1.4s, v21.4s
+; CHECK-NEXT: eor v4.16b, v4.16b, v16.16b
+; CHECK-NEXT: and v20.16b, v0.16b, v24.16b
+; CHECK-NEXT: movi v24.4s, #64, lsl #24
+; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT: eor v16.16b, v17.16b, v19.16b
+; CHECK-NEXT: and v17.16b, v0.16b, v18.16b
+; CHECK-NEXT: mul v18.4s, v1.4s, v23.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT: and v19.16b, v0.16b, v25.16b
+; CHECK-NEXT: movi v22.4s, #16, lsl #24
+; CHECK-NEXT: movi v23.4s, #32, lsl #24
+; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT: and v21.16b, v0.16b, v26.16b
+; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT: mul v5.4s, v1.4s, v20.4s
+; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: eor v6.16b, v16.16b, v18.16b
+; CHECK-NEXT: movi v16.4s, #128, lsl #24
+; CHECK-NEXT: mul v18.4s, v1.4s, v21.4s
+; CHECK-NEXT: and v20.16b, v0.16b, v22.16b
+; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT: and v22.16b, v0.16b, v24.16b
+; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT: eor v6.16b, v6.16b, v19.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: mul v7.4s, v1.4s, v20.4s
+; CHECK-NEXT: mul v16.4s, v1.4s, v21.4s
+; CHECK-NEXT: mul v17.4s, v1.4s, v22.4s
+; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT: eor v4.16b, v6.16b, v18.16b
+; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: ret
+ %a.ext = zext <4 x i32> %a to <4 x i64> ; widen both operands to 64-bit lanes before multiplying
+ %b.ext = zext <4 x i32> %b to <4 x i64>
+ %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext) ; expected output stays in 32-bit lanes: rev32+rbit on inputs and result, then an extra ushr #1
+ %res.ext = lshr <4 x i64> %clmul, splat (i64 32) ; shift by 32, so the truncate below keeps the upper half of the widened product
+ %res = trunc <4 x i64> %res.ext to <4 x i32>
+ ret <4 x i32> %res
+}
+
+; TODO: test disabled for now - blocked on #182270 and/or #182039.
+;define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+; %a.ext = zext <2 x i64> %a to <2 x i128>
+; %b.ext = zext <2 x i64> %b to <2 x i128>
+; %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+; %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
+; %res = trunc <2 x i128> %res.ext to <2 x i64>
+; ret <2 x i64> %res
+;}
More information about the llvm-commits
mailing list