[llvm] [RISCV] Promote i8/i16/i32 scalable vector CLMUL to i64 CLMUL with Zvbc. (PR #184265)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 15:46:15 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Craig Topper (topperc)
<details>
<summary>Changes</summary>
This handles the simple case where we can widen to an i64 vector
without splitting. More work will be done in follow-ups.
Stacked on #184257
---
Patch is 4.54 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/184265.diff
11 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (+3)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+24-2)
- (modified) llvm/test/CodeGen/AArch64/clmul-fixed.ll (+1085-1271)
- (modified) llvm/test/CodeGen/AArch64/clmul-scalable.ll (+213-420)
- (modified) llvm/test/CodeGen/PowerPC/clmul-vector.ll (+3580-2824)
- (modified) llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll (+5535-3621)
- (modified) llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll (+37820-47041)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll (+384-384)
- (modified) llvm/test/CodeGen/X86/clmul-vector-256.ll (+882-861)
- (modified) llvm/test/CodeGen/X86/clmul-vector-512.ll (+1140-1060)
- (modified) llvm/test/CodeGen/X86/clmul-vector.ll (+2722-2729)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 0b1d5bfd078d8..74fe5c5819982 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -389,6 +389,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP:
+ case ISD::CLMUL:
+ case ISD::CLMULH:
+ case ISD::CLMULR:
case ISD::SELECT:
case ISD::VSELECT:
case ISD::SELECT_CC:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a8542be937a87..2bdffad2ded26 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1116,8 +1116,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget.hasStdExtZvbc() && VT.getVectorElementType() == MVT::i64)
- setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+ if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
+ if (VT.getVectorElementType() == MVT::i64)
+ setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+ else {
+ // Promote to i64 if the lmul is small enough.
+ // FIXME: Split if necessary to widen.
+ // FIXME: Promote clmulh directly without legalizing to clmul first.
+ MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+ if (isTypeLegal(I64VecVT))
+ setOperationAction(ISD::CLMUL, VT, Custom);
+ }
+ }
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
}
@@ -8920,6 +8930,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerToScalableOp(Op, DAG);
assert(Op.getOpcode() != ISD::CTTZ);
return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
+ case ISD::CLMUL: {
+ assert(Op.getValueType().isScalableVector() && Subtarget.hasStdExtZvbc() &&
+ "Unexpected custom legalisation");
+ // Promote to i64 vector.
+ MVT VT = Op.getSimpleValueType();
+ MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+ SDLoc DL(Op);
+ SDValue Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(1));
+ SDValue CLMUL = DAG.getNode(ISD::CLMUL, DL, I64VecVT, Op0, Op1);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, CLMUL);
+ }
case ISD::FCOPYSIGN:
if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16)
return lowerFCOPYSIGN(Op, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 8205d6c80221d..23692dc456fc2 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -1730,23 +1730,23 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: clmul_v8i16_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.8h, #2
-; CHECK-NEXT: movi v3.8h, #1
-; CHECK-NEXT: movi v4.8h, #4
-; CHECK-NEXT: movi v5.8h, #8
-; CHECK-NEXT: movi v6.8h, #16
-; CHECK-NEXT: movi v7.8h, #32
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: movi v16.8h, #64
-; CHECK-NEXT: movi v17.8h, #128
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: mov v3.16b, v1.16b
+; CHECK-NEXT: mov v4.16b, v1.16b
+; CHECK-NEXT: mov v5.16b, v1.16b
+; CHECK-NEXT: mov v6.16b, v1.16b
+; CHECK-NEXT: mov v7.16b, v1.16b
+; CHECK-NEXT: mov v16.16b, v1.16b
+; CHECK-NEXT: bic v1.8h, #127
+; CHECK-NEXT: bic v2.8h, #253
+; CHECK-NEXT: bic v3.8h, #254
+; CHECK-NEXT: bic v4.8h, #251
+; CHECK-NEXT: bic v5.8h, #247
+; CHECK-NEXT: bic v6.8h, #239
+; CHECK-NEXT: bic v7.8h, #223
+; CHECK-NEXT: bic v16.8h, #191
+; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: xtn v2.8b, v2.8h
; CHECK-NEXT: xtn v3.8b, v3.8h
; CHECK-NEXT: xtn v4.8b, v4.8h
@@ -1754,7 +1754,6 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-NEXT: xtn v6.8b, v6.8h
; CHECK-NEXT: xtn v7.8b, v7.8h
; CHECK-NEXT: xtn v16.8b, v16.8h
-; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: umull v2.8h, v0.8b, v2.8b
; CHECK-NEXT: umull v3.8h, v0.8b, v3.8b
; CHECK-NEXT: umull v4.8h, v0.8b, v4.8b
@@ -1780,89 +1779,84 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: clmul_v16i16_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v4.8h, #2
-; CHECK-NEXT: ushll v2.8h, v1.8b, #0
-; CHECK-NEXT: movi v5.8h, #1
-; CHECK-NEXT: movi v6.8h, #4
-; CHECK-NEXT: movi v7.8h, #8
-; CHECK-NEXT: movi v17.8h, #16
-; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: movi v18.8h, #32
-; CHECK-NEXT: movi v1.8h, #128
-; CHECK-NEXT: movi v19.8h, #64
-; CHECK-NEXT: movi v25.2d, #0000000000000000
-; CHECK-NEXT: and v16.16b, v2.16b, v4.16b
-; CHECK-NEXT: and v20.16b, v2.16b, v5.16b
-; CHECK-NEXT: and v21.16b, v2.16b, v6.16b
-; CHECK-NEXT: and v22.16b, v2.16b, v7.16b
-; CHECK-NEXT: and v4.16b, v3.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v3.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v3.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v3.16b, v7.16b
-; CHECK-NEXT: and v23.16b, v3.16b, v17.16b
-; CHECK-NEXT: and v24.16b, v3.16b, v18.16b
-; CHECK-NEXT: and v26.16b, v3.16b, v1.16b
-; CHECK-NEXT: and v17.16b, v2.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v2.16b, v18.16b
+; CHECK-NEXT: ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov v4.16b, v2.16b
+; CHECK-NEXT: mov v5.16b, v2.16b
+; CHECK-NEXT: mov v6.16b, v2.16b
+; CHECK-NEXT: mov v7.16b, v2.16b
+; CHECK-NEXT: mov v16.16b, v2.16b
+; CHECK-NEXT: mov v17.16b, v2.16b
+; CHECK-NEXT: mov v18.16b, v1.16b
+; CHECK-NEXT: mov v19.16b, v1.16b
+; CHECK-NEXT: mov v20.16b, v1.16b
+; CHECK-NEXT: mov v21.16b, v1.16b
+; CHECK-NEXT: mov v22.16b, v1.16b
+; CHECK-NEXT: mov v23.16b, v1.16b
+; CHECK-NEXT: bic v4.8h, #253
+; CHECK-NEXT: bic v5.8h, #254
+; CHECK-NEXT: bic v6.8h, #251
+; CHECK-NEXT: bic v7.8h, #247
+; CHECK-NEXT: mov v3.16b, v2.16b
+; CHECK-NEXT: bic v16.8h, #239
+; CHECK-NEXT: bic v17.8h, #223
+; CHECK-NEXT: bic v18.8h, #253
+; CHECK-NEXT: bic v19.8h, #254
+; CHECK-NEXT: bic v20.8h, #251
+; CHECK-NEXT: bic v21.8h, #247
+; CHECK-NEXT: bic v22.8h, #239
+; CHECK-NEXT: bic v23.8h, #223
+; CHECK-NEXT: mov v24.16b, v1.16b
; CHECK-NEXT: uzp1 v4.16b, v0.16b, v4.16b
; CHECK-NEXT: uzp1 v5.16b, v0.16b, v5.16b
; CHECK-NEXT: uzp1 v6.16b, v0.16b, v6.16b
; CHECK-NEXT: uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: uzp1 v23.16b, v0.16b, v23.16b
-; CHECK-NEXT: uzp1 v24.16b, v0.16b, v24.16b
-; CHECK-NEXT: and v3.16b, v3.16b, v19.16b
-; CHECK-NEXT: uzp1 v26.16b, v0.16b, v26.16b
-; CHECK-NEXT: uzp1 v25.16b, v0.16b, v25.16b
-; CHECK-NEXT: xtn v16.8b, v16.8h
+; CHECK-NEXT: bic v3.8h, #191
+; CHECK-NEXT: uzp1 v16.16b, v0.16b, v16.16b
+; CHECK-NEXT: uzp1 v17.16b, v0.16b, v17.16b
+; CHECK-NEXT: xtn v18.8b, v18.8h
+; CHECK-NEXT: xtn v19.8b, v19.8h
; CHECK-NEXT: xtn v20.8b, v20.8h
; CHECK-NEXT: xtn v21.8b, v21.8h
; CHECK-NEXT: xtn v22.8b, v22.8h
-; CHECK-NEXT: xtn v17.8b, v17.8h
-; CHECK-NEXT: xtn v18.8b, v18.8h
-; CHECK-NEXT: and v19.16b, v2.16b, v19.16b
-; CHECK-NEXT: uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: xtn v23.8b, v23.8h
+; CHECK-NEXT: bic v24.8h, #191
; CHECK-NEXT: umull2 v4.8h, v0.16b, v4.16b
; CHECK-NEXT: umull2 v5.8h, v0.16b, v5.16b
; CHECK-NEXT: umull2 v6.8h, v0.16b, v6.16b
; CHECK-NEXT: umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT: umull2 v23.8h, v0.16b, v23.16b
-; CHECK-NEXT: umull2 v24.8h, v0.16b, v24.16b
-; CHECK-NEXT: umull2 v26.8h, v0.16b, v26.16b
-; CHECK-NEXT: umull2 v25.8h, v0.16b, v25.16b
-; CHECK-NEXT: xtn v19.8b, v19.8h
-; CHECK-NEXT: umull v16.8h, v0.8b, v16.8b
+; CHECK-NEXT: uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT: umull2 v16.8h, v0.16b, v16.16b
+; CHECK-NEXT: umull2 v17.8h, v0.16b, v17.16b
+; CHECK-NEXT: umull v18.8h, v0.8b, v18.8b
+; CHECK-NEXT: xtn v24.8b, v24.8h
+; CHECK-NEXT: umull v19.8h, v0.8b, v19.8b
; CHECK-NEXT: umull v20.8h, v0.8b, v20.8b
; CHECK-NEXT: umull v21.8h, v0.8b, v21.8b
; CHECK-NEXT: umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT: umull v17.8h, v0.8b, v17.8b
-; CHECK-NEXT: umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT: umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT: umull v23.8h, v0.8b, v23.8b
+; CHECK-NEXT: bic v2.8h, #127
+; CHECK-NEXT: bic v1.8h, #127
; CHECK-NEXT: eor v4.16b, v5.16b, v4.16b
; CHECK-NEXT: eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v6.16b, v23.16b, v24.16b
-; CHECK-NEXT: eor v7.16b, v26.16b, v25.16b
-; CHECK-NEXT: eor v23.16b, v25.16b, v25.16b
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT: umull v2.8h, v0.8b, v19.8b
-; CHECK-NEXT: eor v16.16b, v20.16b, v16.16b
-; CHECK-NEXT: eor v19.16b, v21.16b, v22.16b
-; CHECK-NEXT: eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT: umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT: eor v6.16b, v16.16b, v17.16b
+; CHECK-NEXT: umull v7.8h, v0.8b, v24.8b
+; CHECK-NEXT: eor v16.16b, v19.16b, v18.16b
+; CHECK-NEXT: eor v17.16b, v20.16b, v21.16b
+; CHECK-NEXT: eor v18.16b, v22.16b, v23.16b
+; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
; CHECK-NEXT: eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v5.16b, v7.16b, v25.16b
-; CHECK-NEXT: eor v6.16b, v23.16b, v25.16b
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: eor v7.16b, v16.16b, v19.16b
-; CHECK-NEXT: eor v2.16b, v17.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v25.16b
-; CHECK-NEXT: eor v5.16b, v6.16b, v25.16b
+; CHECK-NEXT: eor v5.16b, v16.16b, v17.16b
+; CHECK-NEXT: eor v6.16b, v18.16b, v7.16b
+; CHECK-NEXT: umull2 v2.8h, v0.16b, v2.16b
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: eor v1.16b, v7.16b, v2.16b
-; CHECK-NEXT: eor v2.16b, v3.16b, v4.16b
-; CHECK-NEXT: eor v3.16b, v5.16b, v25.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: eor v1.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v5.16b, v6.16b
+; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
; CHECK-NEXT: ret
%zextx = zext <16 x i8> %x to <16 x i16>
%zexty = zext <16 x i8> %y to <16 x i16>
@@ -1963,168 +1957,162 @@ define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: clmul_v8i32_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset b8, -8
; CHECK-NEXT: .cfi_offset b9, -16
-; CHECK-NEXT: movi v4.4s, #2
-; CHECK-NEXT: movi v5.4s, #1
-; CHECK-NEXT: movi v6.4s, #4
-; CHECK-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-NEXT: movi v3.4s, #8
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: movi v20.4s, #16
-; CHECK-NEXT: movi v21.4s, #32
-; CHECK-NEXT: and v17.16b, v2.16b, v4.16b
-; CHECK-NEXT: and v7.16b, v2.16b, v5.16b
-; CHECK-NEXT: and v16.16b, v2.16b, v6.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v24.16b, v1.16b, v20.16b
-; CHECK-NEXT: xtn v17.4h, v17.4s
-; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: xtn v19.4h, v16.4s
-; CHECK-NEXT: uzp1 v4.8h, v0.8h, v4.8h
-; CHECK-NEXT: uzp1 v22.8h, v0.8h, v5.8h
-; CHECK-NEXT: uzp1 v23.8h, v0.8h, v6.8h
-; CHECK-NEXT: uzp1 v18.8h, v0.8h, v18.8h
-; CHECK-NEXT: and v25.16b, v1.16b, v21.16b
-; CHECK-NEXT: movi v6.4s, #128
-; CHECK-NEXT: uzp1 v24.8h, v0.8h, v24.8h
-; CHECK-NEXT: and v28.16b, v2.16b, v20.16b
-; CHECK-NEXT: and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT: umull v5.4s, v0.4h, v17.4h
-; CHECK-NEXT: umull v16.4s, v0.4h, v7.4h
-; CHECK-NEXT: umull v17.4s, v0.4h, v19.4h
-; CHECK-NEXT: xtn v19.4h, v3.4s
+; CHECK-NEXT: .cfi_offset b10, -24
+; CHECK-NEXT: .cfi_offset b11, -32
+; CHECK-NEXT: .cfi_offset b12, -48
+; CHECK-NEXT: movi v19.4s, #2
+; CHECK-NEXT: movi v21.4s, #1
+; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-NEXT: movi v17.4s, #4
+; CHECK-NEXT: movi v20.4s, #8
+; CHECK-NEXT: movi v5.4s, #16
+; CHECK-NEXT: movi v4.4s, #32
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: and v3.16b, v2.16b, v19.16b
+; CHECK-NEXT: and v6.16b, v2.16b, v21.16b
+; CHECK-NEXT: and v7.16b, v2.16b, v17.16b
+; CHECK-NEXT: and v16.16b, v2.16b, v20.16b
+; CHECK-NEXT: and v18.16b, v2.16b, v5.16b
+; CHECK-NEXT: and v22.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT: uzp1 v23.8h, v0.8h, v3.8h
; CHECK-NEXT: movi v3.4s, #64
+; CHECK-NEXT: uzp1 v24.8h, v0.8h, v6.8h
+; CHECK-NEXT: movi v6.4s, #128
+; CHECK-NEXT: uzp1 v25.8h, v0.8h, v7.8h
; CHECK-NEXT: movi v7.4s, #1, lsl #8
-; CHECK-NEXT: umull2 v26.4s, v0.8h, v4.8h
-; CHECK-NEXT: umull2 v22.4s, v0.8h, v22.8h
-; CHECK-NEXT: umull2 v23.4s, v0.8h, v23.8h
-; CHECK-NEXT: umull2 v27.4s, v0.8h, v18.8h
-; CHECK-NEXT: uzp1 v25.8h, v0.8h, v25.8h
-; CHECK-NEXT: movi v4.4s, #2, lsl #8
-; CHECK-NEXT: and v30.16b, v1.16b, v6.16b
-; CHECK-NEXT: movi v18.4s, #8, lsl #8
-; CHECK-NEXT: movi v20.4s, #16, lsl #8
-; CHECK-NEXT: and v29.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v31.16b, v1.16b, v7.16b
+; CHECK-NEXT: uzp1 v26.8h, v0.8h, v16.8h
+; CHECK-NEXT: uzp1 v27.8h, v0.8h, v18.8h
+; CHECK-NEXT: uzp1 v28.8h, v0.8h, v22.8h
+; CHECK-NEXT: movi v16.4s, #8, lsl #8
+; CHECK-NEXT: movi v18.4s, #16, lsl #8
+; CHECK-NEXT: movi v22.4s, #2, lsl #8
+; CHECK-NEXT: umull2 v29.4s, v0.8h, v23.8h
+; CHECK-NEXT: and v23.16b, v2.16b, v3.16b
; CHECK-NEXT: umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT: eor v22.16b, v22.16b, v26.16b
-; CHECK-NEXT: xtn v28.4h, v28.4s
-; CHECK-NEXT: umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT: eor v23.16b, v23.16b, v27.16b
+; CHECK-NEXT: and v30.16b, v2.16b, v6.16b
+; CHECK-NEXT: and v31.16b, v2.16b, v7.16b
; CHECK-NEXT: umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT: uzp1 v27.8h, v0.8h, v30.8h
-; CHECK-NEXT: uzp1 v26.8h, v0.8h, v29.8h
-; CHECK-NEXT: uzp1 v29.8h, v0.8h, v31.8h
-; CHECK-NEXT: and v30.16b, v1.16b, v4.16b
-; CHECK-NEXT: xtn v31.4h, v21.4s
-; CHECK-NEXT: movi v21.4s, #32, lsl #8
-; CHECK-NEXT: and v8.16b, v1.16b, v20.16b
-; CHECK-NEXT: eor v22.16b, v22.16b, v23.16b
-; CHECK-NEXT: and v23.16b, v1.16b, v18.16b
-; CHECK-NEXT: umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT: eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT: umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT: eor v16.16b, v16.16b, v5.16b
-; CHECK-NEXT: umull2 v25.4s, v0.8h, v26.8h
-; CHECK-NEXT: uzp1 v26.8h, v0.8h, v30.8h
-; CHECK-NEXT: umull2 v29.4s, v0.8h, v29.8h
-; CHECK-NEXT: movi v30.2d, #0000000000000000
-; CHECK-NEXT: uzp1 v23.8h, v0.8h, v23.8h
-; CHECK-NEXT: uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT: and v9.16b, v1.16b, v21.16b
-; CHECK-NEXT: umull v31.4s, v0.4h, v31.4h
-; CHECK-NEXT: eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT: and v6.16b, v2.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v2.16b, v7.16b
-; CHECK-NEXT: and v18.16b, v2.16b, v18.16b
; CHECK-NEXT: umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT: eor v27.16b, v27.16b, v29.16b
-; CHECK-NEXT: eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT: uzp1 v29.8h, v0.8h, v9.8h
+; CHECK-NEXT: umull2 v27.4s, v0.8h, v27.8h
+; CHECK-NEXT: umull2 v28.4s, v0.8h, v28.8h
+; CHECK-NEXT: uzp1 v10.8h, v0.8h, v23.8h
+; CHECK-NEXT: movi v23.4s, #32, lsl #8
+; CHECK-NEXT: and v8.16b, v2.16b, v16.16b
+; CHECK-NEXT: and v9.16b, v2.16b, v18.16b
; CHECK-NEXT: uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT: movi v9.4s, #64, lsl #8
-; CHECK-NEXT: umull2 v23.4s, v0.8h, v23.8h
+; CHECK-NEXT: uzp1 v31.8h, v0.8h, v31.8h
+; CHECK-NEXT: and v11.16b, v2.16b, v22.16b
+; CHECK-NEXT: eor v24.16b, v24.16b, v29.16b
+; CHECK-NEXT: xtn v12.4h, v19.4s
+; CHECK-NEXT: uzp1 v8.8h, v0.8h, v8.8h
+; CHECK-NEXT: eor v25.16b, v25.16b, v26.16b
+; CHECK-NEXT: eor v26.16b, v27.16b, v28.16b
+; CHECK-NEXT: uzp1 v9.8h, v0.8h, v9.8h
+; CHECK-NEXT: and v29.16b, v2.16b, v23.16b
+; CHECK-NEXT: umull2 v27.4s, v0.8h, v10.8h
+; CHECK-NEXT: umull2 v28.4s, v0.8h, v30.8h
+; CHECK-NEXT: uzp1 v30.8h, v0.8h, v11.8h
+; CHECK-NEXT: umull2 v31.4s, v0.8h, v31.8h
+; CHECK-NEXT: and v11.16b, v1.16b, v17.16b
+; CHECK-NEXT: eor v17.16b, v24.16b, v25.16b
+; CHECK-NEXT: and v10.16b, v1.16b, v21.16b
+; CHECK-NEXT: uzp1 v29.8h, v0.8h, v29.8h
; CHECK-NEXT: umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT: movi v25.4s, #4, lsl #8
-; CHECK-NEXT: eor v22.16b, v22.16b, v24.16b
-; CHECK-NEXT: eor v19.16b, v28.16b, v31.16b
-; CHECK-NEXT: movi v28.4s, #128, lsl #8
-; CHECK-NEXT: eor v24.16b, v27.16b, v26.16b
-; CHECK-NEXT: and v20.16b, v2.16b, v20.16b
-; CHECK-NEXT: xtn v6.4h, v6.4s
+; CHECK-NEXT: movi v21.4s, #4, lsl #8
+; CHECK-NEXT: umull2 v9.4s, v0.8h, v9.8h
+; CHECK-NEXT: eor v19.16b, v26.16b, v27.16b
+; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT: umull2 v24.4s, v0.8h, v30.8h
+; CHECK-NEXT: eor v25.16b, v28.16b, v31.16b
+; CHECK-NEXT: xtn v28.4h, v11.4s
+; CHECK-NEXT: xtn v30.4h, v20.4s
+; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
; CHECK-NEXT: umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT: umull2 v5.4s, v0.8h, v30.8h
-; CHECK-NEXT: and v29.16b, v1.16b, v9.16b
-; CHECK-NEXT: eor v23.16b, v23.16b, v8.16b
-; CHECK-NEXT: and v26.16b, v1.16b, v25.16b
+; CHECK-NEXT: xtn v10.4h, v10.4s
+; CHECK-NEXT: and v29.16b, v2.16b, v21.16b
+; CHECK-NEXT: eor v26.16b, v8.16b, v9.16b
+; CHECK-NEXT: and v9.16b, v1.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT: eor v20.16b, v25.16b, v24.16b
+; CHECK-NEXT: and v25.16b, v1.16b, v5.16b
+; CHECK-NEXT: umull v28.4s, v0.4h, v28.4h
+; CHECK-NEXT: umull v30.4s, v0.4h, v30.4h
+; CHECK-NEXT: movi v24.4s, #64, lsl #8
; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: and v1.16b, v1.16b, v28.16b
-; CHECK-NEXT: and v4.16b, v2.16b, v4.16b
+; CHECK-NEXT: eor v4.16b, v26.16b, v27.16b
+; CHECK-NEXT: and v26.16b, v1.16b, v6.16b
+; CHECK-NEXT: xtn v27.4h, v9.4s
+; CHECK-NEXT: xtn v25.4h, v25.4s
+; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT: xtn v16.4h, v16.4s
; CHECK-NEXT: xtn v18.4h, v18.4s
-; CHECK-NEXT: xtn v20.4h, v20.4s
-; CHECK-NEXT: and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT: eor v23.16b, v23.16b, v27.16b
-; CHECK-NEXT: uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT: eor v29.16b, v5.16b, v5.16b
-; CHECK-NEXT: uzp1 v26.8h, v0.8h, v26.8h
-; CHECK-NEXT: uzp1 v1.8h, v0.8h, v1.8h
-; CHECK-NEXT: xtn v4.4h, v4.4s
+; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT: uzp1 v5.8h, v0.8h, v29.8h
+; CHECK-NEXT: xtn v26.4h, v26.4s
+; CHECK-NEXT: eor v28.16b, v28.16b, v30.16b
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/184265
More information about the llvm-commits
mailing list