[llvm] [AArch64] Add clmul/h/r v16i8/v8i16/v4i32/v2i64 test coverage (PR #182305)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 19 07:50:31 PST 2026


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/182305

Some of the v16i8/v2i64 tests (clmul_v2i64, and the v16i8/v2i64 variants of clmulr and clmulh) are currently disabled - left as commented-out TODOs - due to #182270 and #182039.

>From d08846f4e41a32075cf5a600f1e01a4859a469ed Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 19 Feb 2026 15:47:20 +0000
Subject: [PATCH] [AArch64] Add clmul/h/r v16i8/v8i16/v4i32/v2i64 test coverage

Some of the v16i8/v2i64 tests are currently disabled due to #182270 and #182039
---
 llvm/test/CodeGen/AArch64/clmul-vector.ll | 807 ++++++++++++++++++++++
 1 file changed, 807 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/clmul-vector.ll

diff --git a/llvm/test/CodeGen/AArch64/clmul-vector.ll b/llvm/test/CodeGen/AArch64/clmul-vector.ll
new file mode 100644
index 0000000000000..03eedd71d670f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/clmul-vector.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+define <16 x i8> @clmul_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: clmul_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pmul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.clmul.v16i8(<16 x i8> %a, <16 x i8> %b) ; i8 lanes: the expected codegen above is a single pmul
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmul_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.8h, #2
+; CHECK-NEXT:    movi v3.8h, #1
+; CHECK-NEXT:    movi v4.8h, #4
+; CHECK-NEXT:    movi v5.8h, #8
+; CHECK-NEXT:    movi v6.8h, #16
+; CHECK-NEXT:    movi v7.8h, #32
+; CHECK-NEXT:    movi v16.8h, #128
+; CHECK-NEXT:    movi v17.8h, #1, lsl #8
+; CHECK-NEXT:    movi v18.8h, #8, lsl #8
+; CHECK-NEXT:    movi v19.8h, #16, lsl #8
+; CHECK-NEXT:    movi v20.8h, #64
+; CHECK-NEXT:    movi v21.8h, #2, lsl #8
+; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT:    movi v22.8h, #32, lsl #8
+; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT:    mul v2.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mul v3.8h, v0.8h, v3.8h
+; CHECK-NEXT:    mul v4.8h, v0.8h, v4.8h
+; CHECK-NEXT:    mul v5.8h, v0.8h, v5.8h
+; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT:    movi v23.8h, #4, lsl #8
+; CHECK-NEXT:    movi v24.8h, #64, lsl #8
+; CHECK-NEXT:    mul v6.8h, v0.8h, v6.8h
+; CHECK-NEXT:    mul v7.8h, v0.8h, v7.8h
+; CHECK-NEXT:    mul v16.8h, v0.8h, v16.8h
+; CHECK-NEXT:    mul v17.8h, v0.8h, v17.8h
+; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT:    mul v18.8h, v0.8h, v18.8h
+; CHECK-NEXT:    mul v19.8h, v0.8h, v19.8h
+; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT:    mul v4.8h, v0.8h, v20.8h
+; CHECK-NEXT:    movi v20.8h, #128, lsl #8
+; CHECK-NEXT:    mul v5.8h, v0.8h, v21.8h
+; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
+; CHECK-NEXT:    and v23.16b, v1.16b, v24.16b
+; CHECK-NEXT:    mul v22.8h, v0.8h, v22.8h
+; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v20.16b
+; CHECK-NEXT:    mul v3.8h, v0.8h, v21.8h
+; CHECK-NEXT:    mul v17.8h, v0.8h, v23.8h
+; CHECK-NEXT:    eor v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    eor v5.16b, v7.16b, v5.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v22.16b
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
+; CHECK-NEXT:    eor v2.16b, v5.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v6.16b, v17.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b) ; i16 lanes: expected codegen above masks each of the 16 bits of v1 (and), multiplies by v0 (mul) and folds the 16 products with eor
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmul_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmul_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4s, #2
+; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    movi v4.4s, #4
+; CHECK-NEXT:    movi v5.4s, #8
+; CHECK-NEXT:    movi v6.4s, #16
+; CHECK-NEXT:    movi v7.4s, #32
+; CHECK-NEXT:    movi v16.4s, #64
+; CHECK-NEXT:    movi v17.4s, #128
+; CHECK-NEXT:    movi v18.4s, #1, lsl #8
+; CHECK-NEXT:    movi v19.4s, #2, lsl #8
+; CHECK-NEXT:    movi v20.4s, #8, lsl #8
+; CHECK-NEXT:    movi v21.4s, #128, lsl #16
+; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
+; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
+; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
+; CHECK-NEXT:    mul v6.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
+; CHECK-NEXT:    movi v22.4s, #8, lsl #16
+; CHECK-NEXT:    movi v23.4s, #2, lsl #24
+; CHECK-NEXT:    movi v25.4s, #4, lsl #24
+; CHECK-NEXT:    movi v24.4s, #32, lsl #16
+; CHECK-NEXT:    movi v26.4s, #8, lsl #24
+; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
+; CHECK-NEXT:    movi v4.4s, #16, lsl #8
+; CHECK-NEXT:    mul v5.4s, v0.4s, v16.4s
+; CHECK-NEXT:    mul v16.4s, v0.4s, v17.4s
+; CHECK-NEXT:    mul v17.4s, v0.4s, v18.4s
+; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
+; CHECK-NEXT:    movi v19.4s, #32, lsl #8
+; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    movi v20.4s, #64, lsl #8
+; CHECK-NEXT:    mul v21.4s, v0.4s, v21.4s
+; CHECK-NEXT:    and v3.16b, v1.16b, v4.16b
+; CHECK-NEXT:    eor v5.16b, v6.16b, v5.16b
+; CHECK-NEXT:    movi v4.4s, #1, lsl #16
+; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    movi v16.4s, #2, lsl #16
+; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
+; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT:    movi v17.4s, #4, lsl #8
+; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v1.16b, v16.16b
+; CHECK-NEXT:    movi v16.4s, #64, lsl #16
+; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
+; CHECK-NEXT:    mul v7.4s, v0.4s, v19.4s
+; CHECK-NEXT:    movi v19.4s, #4, lsl #16
+; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
+; CHECK-NEXT:    eor v3.16b, v18.16b, v3.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
+; CHECK-NEXT:    movi v20.4s, #1, lsl #24
+; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
+; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
+; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
+; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT:    movi v19.4s, #128, lsl #8
+; CHECK-NEXT:    mul v16.4s, v0.4s, v16.4s
+; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #16
+; CHECK-NEXT:    mul v20.4s, v0.4s, v20.4s
+; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v18.16b
+; CHECK-NEXT:    and v17.16b, v1.16b, v19.16b
+; CHECK-NEXT:    mul v18.4s, v0.4s, v23.4s
+; CHECK-NEXT:    and v19.16b, v1.16b, v25.16b
+; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT:    and v21.16b, v1.16b, v24.16b
+; CHECK-NEXT:    movi v23.4s, #32, lsl #24
+; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #24
+; CHECK-NEXT:    movi v24.4s, #64, lsl #24
+; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
+; CHECK-NEXT:    eor v16.16b, v16.16b, v20.16b
+; CHECK-NEXT:    and v20.16b, v1.16b, v26.16b
+; CHECK-NEXT:    mul v19.4s, v0.4s, v19.4s
+; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    mul v6.4s, v0.4s, v21.4s
+; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
+; CHECK-NEXT:    eor v5.16b, v16.16b, v18.16b
+; CHECK-NEXT:    movi v16.4s, #128, lsl #24
+; CHECK-NEXT:    mul v18.4s, v0.4s, v20.4s
+; CHECK-NEXT:    and v20.16b, v1.16b, v22.16b
+; CHECK-NEXT:    and v22.16b, v1.16b, v24.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v17.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT:    eor v5.16b, v5.16b, v19.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v16.16b
+; CHECK-NEXT:    mul v7.4s, v0.4s, v20.4s
+; CHECK-NEXT:    mul v16.4s, v0.4s, v21.4s
+; CHECK-NEXT:    mul v17.4s, v0.4s, v22.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
+; CHECK-NEXT:    eor v4.16b, v5.16b, v18.16b
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b) ; i32 lanes: expected codegen above masks each of the 32 bits of v1 (and), multiplies by v0 (mul) and folds the 32 products with eor
+  ret <4 x i32> %res
+}
+
+; TODO: disabled for now, see #182270 and #182039.
+;define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+;  %res = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+;  ret <2 x i64> %res
+;}
+
+; TODO: disabled for now, see #182270 and #182039.
+;define <16 x i8> @clmulr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+;  %a.ext = zext <16 x i8> %a to <16 x i16>
+;  %b.ext = zext <16 x i8> %b to <16 x i16>
+;  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+;  %res.ext = lshr <16 x i16> %clmul, splat (i16 7)
+;  %res = trunc <16 x i16> %res.ext to <16 x i8>
+;  ret <16 x i8> %res
+;}
+
+define <8 x i16> @clmulr_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmulr_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.16b, #15
+; CHECK-NEXT:    rev16 v1.16b, v1.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    movi v7.8h, #1
+; CHECK-NEXT:    movi v17.8h, #4
+; CHECK-NEXT:    movi v18.8h, #8
+; CHECK-NEXT:    movi v19.8h, #16
+; CHECK-NEXT:    movi v20.8h, #32
+; CHECK-NEXT:    movi v22.8h, #128
+; CHECK-NEXT:    movi v23.8h, #1, lsl #8
+; CHECK-NEXT:    movi v25.8h, #8, lsl #8
+; CHECK-NEXT:    movi v26.8h, #16, lsl #8
+; CHECK-NEXT:    ushr v3.8h, v1.8h, #4
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ushr v4.8h, v0.8h, #4
+; CHECK-NEXT:    and v5.16b, v0.16b, v2.16b
+; CHECK-NEXT:    movi v0.16b, #51
+; CHECK-NEXT:    movi v21.8h, #64
+; CHECK-NEXT:    movi v27.8h, #32, lsl #8
+; CHECK-NEXT:    movi v24.8h, #4, lsl #8
+; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT:    shl v1.8h, v1.8h, #4
+; CHECK-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT:    shl v5.8h, v5.8h, #4
+; CHECK-NEXT:    orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    orr v3.16b, v4.16b, v5.16b
+; CHECK-NEXT:    ushr v4.8h, v1.8h, #2
+; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ushr v5.8h, v3.8h, #2
+; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v4.16b, v4.16b, v0.16b
+; CHECK-NEXT:    shl v6.8h, v1.8h, #2
+; CHECK-NEXT:    movi v1.16b, #85
+; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
+; CHECK-NEXT:    shl v3.8h, v3.8h, #2
+; CHECK-NEXT:    orr v4.16b, v4.16b, v6.16b
+; CHECK-NEXT:    movi v6.8h, #2
+; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    ushr v5.8h, v4.8h, #1
+; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT:    ushr v16.8h, v3.8h, #1
+; CHECK-NEXT:    and v3.16b, v3.16b, v1.16b
+; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT:    add v4.8h, v4.8h, v4.8h
+; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT:    add v3.8h, v3.8h, v3.8h
+; CHECK-NEXT:    orr v4.16b, v5.16b, v4.16b
+; CHECK-NEXT:    movi v5.8h, #2, lsl #8
+; CHECK-NEXT:    orr v3.16b, v16.16b, v3.16b
+; CHECK-NEXT:    movi v16.8h, #64, lsl #8
+; CHECK-NEXT:    and v6.16b, v4.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v4.16b, v7.16b
+; CHECK-NEXT:    and v17.16b, v4.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v4.16b, v18.16b
+; CHECK-NEXT:    and v19.16b, v4.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v4.16b, v20.16b
+; CHECK-NEXT:    and v22.16b, v4.16b, v22.16b
+; CHECK-NEXT:    and v23.16b, v4.16b, v23.16b
+; CHECK-NEXT:    and v25.16b, v4.16b, v25.16b
+; CHECK-NEXT:    and v26.16b, v4.16b, v26.16b
+; CHECK-NEXT:    mul v6.8h, v3.8h, v6.8h
+; CHECK-NEXT:    mul v7.8h, v3.8h, v7.8h
+; CHECK-NEXT:    mul v17.8h, v3.8h, v17.8h
+; CHECK-NEXT:    mul v18.8h, v3.8h, v18.8h
+; CHECK-NEXT:    and v21.16b, v4.16b, v21.16b
+; CHECK-NEXT:    and v5.16b, v4.16b, v5.16b
+; CHECK-NEXT:    and v27.16b, v4.16b, v27.16b
+; CHECK-NEXT:    mul v19.8h, v3.8h, v19.8h
+; CHECK-NEXT:    mul v20.8h, v3.8h, v20.8h
+; CHECK-NEXT:    mul v22.8h, v3.8h, v22.8h
+; CHECK-NEXT:    mul v23.8h, v3.8h, v23.8h
+; CHECK-NEXT:    mul v25.8h, v3.8h, v25.8h
+; CHECK-NEXT:    mul v26.8h, v3.8h, v26.8h
+; CHECK-NEXT:    eor v6.16b, v7.16b, v6.16b
+; CHECK-NEXT:    movi v7.8h, #128, lsl #8
+; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT:    and v18.16b, v4.16b, v24.16b
+; CHECK-NEXT:    mul v21.8h, v3.8h, v21.8h
+; CHECK-NEXT:    mul v5.8h, v3.8h, v5.8h
+; CHECK-NEXT:    mul v24.8h, v3.8h, v27.8h
+; CHECK-NEXT:    and v16.16b, v4.16b, v16.16b
+; CHECK-NEXT:    eor v19.16b, v19.16b, v20.16b
+; CHECK-NEXT:    eor v20.16b, v22.16b, v23.16b
+; CHECK-NEXT:    eor v22.16b, v25.16b, v26.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT:    and v4.16b, v4.16b, v7.16b
+; CHECK-NEXT:    mul v7.8h, v3.8h, v18.8h
+; CHECK-NEXT:    mul v16.8h, v3.8h, v16.8h
+; CHECK-NEXT:    eor v17.16b, v19.16b, v21.16b
+; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
+; CHECK-NEXT:    eor v18.16b, v22.16b, v24.16b
+; CHECK-NEXT:    mul v3.8h, v3.8h, v4.8h
+; CHECK-NEXT:    eor v4.16b, v6.16b, v17.16b
+; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT:    eor v6.16b, v18.16b, v16.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    rev16 v3.16b, v3.16b
+; CHECK-NEXT:    ushr v4.8h, v3.8h, #4
+; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    shl v3.8h, v3.8h, #4
+; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ushr v3.8h, v2.8h, #2
+; CHECK-NEXT:    and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    shl v2.8h, v2.8h, #2
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ushr v2.8h, v0.8h, #1
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %a.ext = zext <8 x i16> %a to <8 x i32> ; widen both operands so the i32 clmul below holds the whole product
+  %b.ext = zext <8 x i16> %b to <8 x i32>
+  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext) ; expected codegen above: bit-reverse each i16 lane of both inputs (rev16 + nibble/pair/bit swaps), per-bit and/mul/eor on .8h lanes, bit-reverse the result
+  %res.ext = lshr <8 x i32> %clmul, splat (i32 15) ; with the trunc below this returns bits [30:15] of the widened product
+  %res = trunc <8 x i32> %res.ext to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulr_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmulr_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v1.16b, v1.16b
+; CHECK-NEXT:    rev32 v2.16b, v0.16b
+; CHECK-NEXT:    movi v3.4s, #2
+; CHECK-NEXT:    movi v4.4s, #1
+; CHECK-NEXT:    movi v5.4s, #4
+; CHECK-NEXT:    movi v6.4s, #8
+; CHECK-NEXT:    movi v7.4s, #16
+; CHECK-NEXT:    movi v16.4s, #32
+; CHECK-NEXT:    movi v17.4s, #64
+; CHECK-NEXT:    movi v18.4s, #1, lsl #8
+; CHECK-NEXT:    movi v19.4s, #2, lsl #8
+; CHECK-NEXT:    movi v20.4s, #8, lsl #8
+; CHECK-NEXT:    rbit v0.16b, v1.16b
+; CHECK-NEXT:    rbit v1.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #128
+; CHECK-NEXT:    movi v21.4s, #16, lsl #8
+; CHECK-NEXT:    movi v22.4s, #8, lsl #16
+; CHECK-NEXT:    movi v23.4s, #2, lsl #24
+; CHECK-NEXT:    movi v25.4s, #4, lsl #24
+; CHECK-NEXT:    movi v24.4s, #32, lsl #16
+; CHECK-NEXT:    movi v26.4s, #8, lsl #24
+; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
+; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
+; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
+; CHECK-NEXT:    movi v18.4s, #32, lsl #8
+; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
+; CHECK-NEXT:    movi v19.4s, #1, lsl #16
+; CHECK-NEXT:    movi v20.4s, #4, lsl #8
+; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT:    movi v17.4s, #2, lsl #16
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    movi v21.4s, #64, lsl #8
+; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
+; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
+; CHECK-NEXT:    movi v19.4s, #128, lsl #16
+; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
+; CHECK-NEXT:    movi v17.4s, #64, lsl #16
+; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
+; CHECK-NEXT:    movi v18.4s, #4, lsl #16
+; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
+; CHECK-NEXT:    movi v21.4s, #1, lsl #24
+; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT:    movi v18.4s, #128, lsl #8
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #16
+; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
+; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
+; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
+; CHECK-NEXT:    movi v24.4s, #64, lsl #24
+; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #24
+; CHECK-NEXT:    movi v23.4s, #32, lsl #24
+; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
+; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
+; CHECK-NEXT:    movi v16.4s, #128, lsl #24
+; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
+; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
+; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
+; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
+; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
+; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %a.ext = zext <4 x i32> %a to <4 x i64> ; widen both operands so the i64 clmul below holds the whole product
+  %b.ext = zext <4 x i32> %b to <4 x i64>
+  %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext) ; expected codegen above: bit-reverse each i32 lane of both inputs (rev32 + rbit), per-bit and/mul/eor on .4s lanes, bit-reverse the result
+  %res.ext = lshr <4 x i64> %clmul, splat (i64 31) ; with the trunc below this returns bits [62:31] of the widened product
+  %res = trunc <4 x i64> %res.ext to <4 x i32>
+  ret <4 x i32> %res
+}
+
+; TODO: disabled for now, see #182270 and #182039.
+;define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+;  %a.ext = zext <2 x i64> %a to <2 x i128>
+;  %b.ext = zext <2 x i64> %b to <2 x i128>
+;  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+;  %res.ext = lshr <2 x i128> %clmul, splat (i128 63)
+;  %res = trunc <2 x i128> %res.ext to <2 x i64>
+;  ret <2 x i64> %res
+;}
+
+; TODO: disabled for now, see #182270 and #182039.
+;define <16 x i8> @clmulh_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+;  %a.ext = zext <16 x i8> %a to <16 x i16>
+;  %b.ext = zext <16 x i8> %b to <16 x i16>
+;  %clmul = call <16 x i16> @llvm.clmul.v16i16(<16 x i16> %a.ext, <16 x i16> %b.ext)
+;  %res.ext = lshr <16 x i16> %clmul, splat (i16 8)
+;  %res = trunc <16 x i16> %res.ext to <16 x i8>
+;  ret <16 x i8> %res
+;}
+
+define <8 x i16> @clmulh_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: clmulh_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.16b, #15
+; CHECK-NEXT:    rev16 v1.16b, v1.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    movi v7.8h, #1
+; CHECK-NEXT:    movi v17.8h, #4
+; CHECK-NEXT:    movi v18.8h, #8
+; CHECK-NEXT:    movi v19.8h, #16
+; CHECK-NEXT:    movi v20.8h, #32
+; CHECK-NEXT:    movi v22.8h, #128
+; CHECK-NEXT:    movi v23.8h, #1, lsl #8
+; CHECK-NEXT:    movi v25.8h, #8, lsl #8
+; CHECK-NEXT:    movi v26.8h, #16, lsl #8
+; CHECK-NEXT:    ushr v3.8h, v1.8h, #4
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ushr v4.8h, v0.8h, #4
+; CHECK-NEXT:    and v5.16b, v0.16b, v2.16b
+; CHECK-NEXT:    movi v0.16b, #51
+; CHECK-NEXT:    movi v21.8h, #64
+; CHECK-NEXT:    movi v27.8h, #32, lsl #8
+; CHECK-NEXT:    movi v24.8h, #4, lsl #8
+; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT:    shl v1.8h, v1.8h, #4
+; CHECK-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT:    shl v5.8h, v5.8h, #4
+; CHECK-NEXT:    orr v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    orr v3.16b, v4.16b, v5.16b
+; CHECK-NEXT:    ushr v4.8h, v1.8h, #2
+; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ushr v5.8h, v3.8h, #2
+; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v4.16b, v4.16b, v0.16b
+; CHECK-NEXT:    shl v6.8h, v1.8h, #2
+; CHECK-NEXT:    movi v1.16b, #85
+; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
+; CHECK-NEXT:    shl v3.8h, v3.8h, #2
+; CHECK-NEXT:    orr v4.16b, v4.16b, v6.16b
+; CHECK-NEXT:    movi v6.8h, #2
+; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    ushr v5.8h, v4.8h, #1
+; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
+; CHECK-NEXT:    ushr v16.8h, v3.8h, #1
+; CHECK-NEXT:    and v3.16b, v3.16b, v1.16b
+; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
+; CHECK-NEXT:    add v4.8h, v4.8h, v4.8h
+; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
+; CHECK-NEXT:    add v3.8h, v3.8h, v3.8h
+; CHECK-NEXT:    orr v4.16b, v5.16b, v4.16b
+; CHECK-NEXT:    movi v5.8h, #2, lsl #8
+; CHECK-NEXT:    orr v3.16b, v16.16b, v3.16b
+; CHECK-NEXT:    movi v16.8h, #64, lsl #8
+; CHECK-NEXT:    and v6.16b, v4.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v4.16b, v7.16b
+; CHECK-NEXT:    and v17.16b, v4.16b, v17.16b
+; CHECK-NEXT:    and v18.16b, v4.16b, v18.16b
+; CHECK-NEXT:    and v19.16b, v4.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v4.16b, v20.16b
+; CHECK-NEXT:    and v22.16b, v4.16b, v22.16b
+; CHECK-NEXT:    and v23.16b, v4.16b, v23.16b
+; CHECK-NEXT:    and v25.16b, v4.16b, v25.16b
+; CHECK-NEXT:    and v26.16b, v4.16b, v26.16b
+; CHECK-NEXT:    mul v6.8h, v3.8h, v6.8h
+; CHECK-NEXT:    mul v7.8h, v3.8h, v7.8h
+; CHECK-NEXT:    mul v17.8h, v3.8h, v17.8h
+; CHECK-NEXT:    mul v18.8h, v3.8h, v18.8h
+; CHECK-NEXT:    and v21.16b, v4.16b, v21.16b
+; CHECK-NEXT:    and v5.16b, v4.16b, v5.16b
+; CHECK-NEXT:    and v27.16b, v4.16b, v27.16b
+; CHECK-NEXT:    mul v19.8h, v3.8h, v19.8h
+; CHECK-NEXT:    mul v20.8h, v3.8h, v20.8h
+; CHECK-NEXT:    mul v22.8h, v3.8h, v22.8h
+; CHECK-NEXT:    mul v23.8h, v3.8h, v23.8h
+; CHECK-NEXT:    mul v25.8h, v3.8h, v25.8h
+; CHECK-NEXT:    mul v26.8h, v3.8h, v26.8h
+; CHECK-NEXT:    eor v6.16b, v7.16b, v6.16b
+; CHECK-NEXT:    movi v7.8h, #128, lsl #8
+; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT:    and v18.16b, v4.16b, v24.16b
+; CHECK-NEXT:    mul v21.8h, v3.8h, v21.8h
+; CHECK-NEXT:    mul v5.8h, v3.8h, v5.8h
+; CHECK-NEXT:    mul v24.8h, v3.8h, v27.8h
+; CHECK-NEXT:    and v16.16b, v4.16b, v16.16b
+; CHECK-NEXT:    eor v19.16b, v19.16b, v20.16b
+; CHECK-NEXT:    eor v20.16b, v22.16b, v23.16b
+; CHECK-NEXT:    eor v22.16b, v25.16b, v26.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT:    and v4.16b, v4.16b, v7.16b
+; CHECK-NEXT:    mul v7.8h, v3.8h, v18.8h
+; CHECK-NEXT:    mul v16.8h, v3.8h, v16.8h
+; CHECK-NEXT:    eor v17.16b, v19.16b, v21.16b
+; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
+; CHECK-NEXT:    eor v18.16b, v22.16b, v24.16b
+; CHECK-NEXT:    mul v3.8h, v3.8h, v4.8h
+; CHECK-NEXT:    eor v4.16b, v6.16b, v17.16b
+; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT:    eor v6.16b, v18.16b, v16.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
+; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    rev16 v3.16b, v3.16b
+; CHECK-NEXT:    ushr v4.8h, v3.8h, #4
+; CHECK-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-NEXT:    and v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    shl v3.8h, v3.8h, #4
+; CHECK-NEXT:    orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT:    ushr v3.8h, v2.8h, #2
+; CHECK-NEXT:    and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    shl v2.8h, v2.8h, #2
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    ushr v2.8h, v0.8h, #1
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    add v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
+; CHECK-NEXT:    ret
+  %a.ext = zext <8 x i16> %a to <8 x i32> ; widen both operands so the i32 clmul below holds the whole product
+  %b.ext = zext <8 x i16> %b to <8 x i32>
+  %clmul = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %a.ext, <8 x i32> %b.ext) ; expected codegen above: bit-reverse each i16 lane of both inputs, per-bit and/mul/eor on .8h lanes, bit-reverse the result, then ushr #1
+  %res.ext = lshr <8 x i32> %clmul, splat (i32 16) ; with the trunc below this returns the high half, bits [31:16], of the widened product
+  %res = trunc <8 x i32> %res.ext to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @clmulh_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: clmulh_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v1.16b, v1.16b
+; CHECK-NEXT:    rev32 v2.16b, v0.16b
+; CHECK-NEXT:    movi v3.4s, #2
+; CHECK-NEXT:    movi v4.4s, #1
+; CHECK-NEXT:    movi v5.4s, #4
+; CHECK-NEXT:    movi v6.4s, #8
+; CHECK-NEXT:    movi v7.4s, #16
+; CHECK-NEXT:    movi v16.4s, #32
+; CHECK-NEXT:    movi v17.4s, #64
+; CHECK-NEXT:    movi v18.4s, #1, lsl #8
+; CHECK-NEXT:    movi v19.4s, #2, lsl #8
+; CHECK-NEXT:    movi v20.4s, #8, lsl #8
+; CHECK-NEXT:    rbit v0.16b, v1.16b
+; CHECK-NEXT:    rbit v1.16b, v2.16b
+; CHECK-NEXT:    movi v2.4s, #128
+; CHECK-NEXT:    movi v21.4s, #16, lsl #8
+; CHECK-NEXT:    movi v22.4s, #8, lsl #16
+; CHECK-NEXT:    movi v23.4s, #2, lsl #24
+; CHECK-NEXT:    movi v25.4s, #4, lsl #24
+; CHECK-NEXT:    movi v24.4s, #32, lsl #16
+; CHECK-NEXT:    movi v26.4s, #8, lsl #24
+; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
+; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
+; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
+; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
+; CHECK-NEXT:    movi v18.4s, #32, lsl #8
+; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
+; CHECK-NEXT:    movi v19.4s, #1, lsl #16
+; CHECK-NEXT:    movi v20.4s, #4, lsl #8
+; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
+; CHECK-NEXT:    movi v17.4s, #2, lsl #16
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
+; CHECK-NEXT:    movi v21.4s, #64, lsl #8
+; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
+; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
+; CHECK-NEXT:    movi v19.4s, #128, lsl #16
+; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
+; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
+; CHECK-NEXT:    movi v17.4s, #64, lsl #16
+; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
+; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
+; CHECK-NEXT:    movi v18.4s, #4, lsl #16
+; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
+; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
+; CHECK-NEXT:    movi v21.4s, #1, lsl #24
+; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
+; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
+; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
+; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
+; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
+; CHECK-NEXT:    movi v18.4s, #128, lsl #8
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #16
+; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
+; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
+; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
+; CHECK-NEXT:    movi v24.4s, #64, lsl #24
+; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
+; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
+; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
+; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
+; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
+; CHECK-NEXT:    movi v22.4s, #16, lsl #24
+; CHECK-NEXT:    movi v23.4s, #32, lsl #24
+; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
+; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
+; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
+; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
+; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
+; CHECK-NEXT:    movi v16.4s, #128, lsl #24
+; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
+; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
+; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
+; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
+; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
+; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
+; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
+; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
+; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
+; CHECK-NEXT:    ret
+  %a.ext = zext <4 x i32> %a to <4 x i64> ; widen both operands so the i64 clmul below holds the whole product
+  %b.ext = zext <4 x i32> %b to <4 x i64>
+  %clmul = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %a.ext, <4 x i64> %b.ext) ; expected codegen above: bit-reverse each i32 lane of both inputs (rev32 + rbit), per-bit and/mul/eor on .4s lanes, bit-reverse the result, then ushr #1
+  %res.ext = lshr <4 x i64> %clmul, splat (i64 32) ; with the trunc below this returns the high half, bits [63:32], of the widened product
+  %res = trunc <4 x i64> %res.ext to <4 x i32>
+  ret <4 x i32> %res
+}
+
+; TODO: disabled for now, see #182270 and #182039.
+;define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
+;  %a.ext = zext <2 x i64> %a to <2 x i128>
+;  %b.ext = zext <2 x i64> %b to <2 x i128>
+;  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %a.ext, <2 x i128> %b.ext)
+;  %res.ext = lshr <2 x i128> %clmul, splat (i128 64)
+;  %res = trunc <2 x i128> %res.ext to <2 x i64>
+;  ret <2 x i64> %res
+;}



More information about the llvm-commits mailing list