[llvm] [RISCV] Promote i8/i16/i32 scalable vector CLMUL to i64 CLMUL with Zvbc. (PR #184265)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 2 15:46:15 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Craig Topper (topperc)

<details>
<summary>Changes</summary>

This handles the simple case where we can widen to an i64 vector
without splitting. More work will be done in follow-ups.

Stacked on #<!-- -->184257

---

Patch is 4.54 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/184265.diff


11 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (+3) 
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+24-2) 
- (modified) llvm/test/CodeGen/AArch64/clmul-fixed.ll (+1085-1271) 
- (modified) llvm/test/CodeGen/AArch64/clmul-scalable.ll (+213-420) 
- (modified) llvm/test/CodeGen/PowerPC/clmul-vector.ll (+3580-2824) 
- (modified) llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll (+5535-3621) 
- (modified) llvm/test/CodeGen/RISCV/rvv/clmulh-sdnode.ll (+37820-47041) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll (+384-384) 
- (modified) llvm/test/CodeGen/X86/clmul-vector-256.ll (+882-861) 
- (modified) llvm/test/CodeGen/X86/clmul-vector-512.ll (+1140-1060) 
- (modified) llvm/test/CodeGen/X86/clmul-vector.ll (+2722-2729) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 0b1d5bfd078d8..74fe5c5819982 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -389,6 +389,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTPOP:
+  case ISD::CLMUL:
+  case ISD::CLMULH:
+  case ISD::CLMULR:
   case ISD::SELECT:
   case ISD::VSELECT:
   case ISD::SELECT_CC:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a8542be937a87..2bdffad2ded26 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1116,8 +1116,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         }
       }
 
-      if (Subtarget.hasStdExtZvbc() && VT.getVectorElementType() == MVT::i64)
-        setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+      if (Subtarget.hasStdExtZvbc() && Subtarget.hasVInstructionsI64()) {
+        if (VT.getVectorElementType() == MVT::i64)
+          setOperationAction({ISD::CLMUL, ISD::CLMULH}, VT, Legal);
+        else {
+          // Promote to i64 if the lmul is small enough.
+          // FIXME: Split if necessary to widen.
+          // FIXME: Promote clmulh directly without legalizing to clmul first.
+          MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+          if (isTypeLegal(I64VecVT))
+            setOperationAction(ISD::CLMUL, VT, Custom);
+        }
+      }
 
       setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
     }
@@ -8920,6 +8930,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       return lowerToScalableOp(Op, DAG);
     assert(Op.getOpcode() != ISD::CTTZ);
     return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
+  case ISD::CLMUL: {
+    assert(Op.getValueType().isScalableVector() && Subtarget.hasStdExtZvbc() &&
+           "Unexpected custom legalisation");
+    // Promote to i64 vector.
+    MVT VT = Op.getSimpleValueType();
+    MVT I64VecVT = MVT::getVectorVT(MVT::i64, VT.getVectorElementCount());
+    SDLoc DL(Op);
+    SDValue Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, I64VecVT, Op.getOperand(1));
+    SDValue CLMUL = DAG.getNode(ISD::CLMUL, DL, I64VecVT, Op0, Op1);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, CLMUL);
+  }
   case ISD::FCOPYSIGN:
     if (Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16)
       return lowerFCOPYSIGN(Op, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 8205d6c80221d..23692dc456fc2 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -1730,23 +1730,23 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
 define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v3.8h, #1
-; CHECK-NEXT:    movi v4.8h, #4
-; CHECK-NEXT:    movi v5.8h, #8
-; CHECK-NEXT:    movi v6.8h, #16
-; CHECK-NEXT:    movi v7.8h, #32
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    movi v16.8h, #64
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    mov v3.16b, v1.16b
+; CHECK-NEXT:    mov v4.16b, v1.16b
+; CHECK-NEXT:    mov v5.16b, v1.16b
+; CHECK-NEXT:    mov v6.16b, v1.16b
+; CHECK-NEXT:    mov v7.16b, v1.16b
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    bic v1.8h, #127
+; CHECK-NEXT:    bic v2.8h, #253
+; CHECK-NEXT:    bic v3.8h, #254
+; CHECK-NEXT:    bic v4.8h, #251
+; CHECK-NEXT:    bic v5.8h, #247
+; CHECK-NEXT:    bic v6.8h, #239
+; CHECK-NEXT:    bic v7.8h, #223
+; CHECK-NEXT:    bic v16.8h, #191
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    xtn v2.8b, v2.8h
 ; CHECK-NEXT:    xtn v3.8b, v3.8h
 ; CHECK-NEXT:    xtn v4.8b, v4.8h
@@ -1754,7 +1754,6 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-NEXT:    xtn v6.8b, v6.8h
 ; CHECK-NEXT:    xtn v7.8b, v7.8h
 ; CHECK-NEXT:    xtn v16.8b, v16.8h
-; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    umull v2.8h, v0.8b, v2.8b
 ; CHECK-NEXT:    umull v3.8h, v0.8b, v3.8b
 ; CHECK-NEXT:    umull v4.8h, v0.8b, v4.8b
@@ -1780,89 +1779,84 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: clmul_v16i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v4.8h, #2
-; CHECK-NEXT:    ushll v2.8h, v1.8b, #0
-; CHECK-NEXT:    movi v5.8h, #1
-; CHECK-NEXT:    movi v6.8h, #4
-; CHECK-NEXT:    movi v7.8h, #8
-; CHECK-NEXT:    movi v17.8h, #16
-; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    movi v18.8h, #32
-; CHECK-NEXT:    movi v1.8h, #128
-; CHECK-NEXT:    movi v19.8h, #64
-; CHECK-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-NEXT:    and v16.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v20.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v22.16b, v2.16b, v7.16b
-; CHECK-NEXT:    and v4.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v3.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v3.16b, v7.16b
-; CHECK-NEXT:    and v23.16b, v3.16b, v17.16b
-; CHECK-NEXT:    and v24.16b, v3.16b, v18.16b
-; CHECK-NEXT:    and v26.16b, v3.16b, v1.16b
-; CHECK-NEXT:    and v17.16b, v2.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v18.16b
+; CHECK-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v4.16b, v2.16b
+; CHECK-NEXT:    mov v5.16b, v2.16b
+; CHECK-NEXT:    mov v6.16b, v2.16b
+; CHECK-NEXT:    mov v7.16b, v2.16b
+; CHECK-NEXT:    mov v16.16b, v2.16b
+; CHECK-NEXT:    mov v17.16b, v2.16b
+; CHECK-NEXT:    mov v18.16b, v1.16b
+; CHECK-NEXT:    mov v19.16b, v1.16b
+; CHECK-NEXT:    mov v20.16b, v1.16b
+; CHECK-NEXT:    mov v21.16b, v1.16b
+; CHECK-NEXT:    mov v22.16b, v1.16b
+; CHECK-NEXT:    mov v23.16b, v1.16b
+; CHECK-NEXT:    bic v4.8h, #253
+; CHECK-NEXT:    bic v5.8h, #254
+; CHECK-NEXT:    bic v6.8h, #251
+; CHECK-NEXT:    bic v7.8h, #247
+; CHECK-NEXT:    mov v3.16b, v2.16b
+; CHECK-NEXT:    bic v16.8h, #239
+; CHECK-NEXT:    bic v17.8h, #223
+; CHECK-NEXT:    bic v18.8h, #253
+; CHECK-NEXT:    bic v19.8h, #254
+; CHECK-NEXT:    bic v20.8h, #251
+; CHECK-NEXT:    bic v21.8h, #247
+; CHECK-NEXT:    bic v22.8h, #239
+; CHECK-NEXT:    bic v23.8h, #223
+; CHECK-NEXT:    mov v24.16b, v1.16b
 ; CHECK-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
 ; CHECK-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    uzp1 v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    uzp1 v24.16b, v0.16b, v24.16b
-; CHECK-NEXT:    and v3.16b, v3.16b, v19.16b
-; CHECK-NEXT:    uzp1 v26.16b, v0.16b, v26.16b
-; CHECK-NEXT:    uzp1 v25.16b, v0.16b, v25.16b
-; CHECK-NEXT:    xtn v16.8b, v16.8h
+; CHECK-NEXT:    bic v3.8h, #191
+; CHECK-NEXT:    uzp1 v16.16b, v0.16b, v16.16b
+; CHECK-NEXT:    uzp1 v17.16b, v0.16b, v17.16b
+; CHECK-NEXT:    xtn v18.8b, v18.8h
+; CHECK-NEXT:    xtn v19.8b, v19.8h
 ; CHECK-NEXT:    xtn v20.8b, v20.8h
 ; CHECK-NEXT:    xtn v21.8b, v21.8h
 ; CHECK-NEXT:    xtn v22.8b, v22.8h
-; CHECK-NEXT:    xtn v17.8b, v17.8h
-; CHECK-NEXT:    xtn v18.8b, v18.8h
-; CHECK-NEXT:    and v19.16b, v2.16b, v19.16b
-; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    xtn v23.8b, v23.8h
+; CHECK-NEXT:    bic v24.8h, #191
 ; CHECK-NEXT:    umull2 v4.8h, v0.16b, v4.16b
 ; CHECK-NEXT:    umull2 v5.8h, v0.16b, v5.16b
 ; CHECK-NEXT:    umull2 v6.8h, v0.16b, v6.16b
 ; CHECK-NEXT:    umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT:    umull2 v23.8h, v0.16b, v23.16b
-; CHECK-NEXT:    umull2 v24.8h, v0.16b, v24.16b
-; CHECK-NEXT:    umull2 v26.8h, v0.16b, v26.16b
-; CHECK-NEXT:    umull2 v25.8h, v0.16b, v25.16b
-; CHECK-NEXT:    xtn v19.8b, v19.8h
-; CHECK-NEXT:    umull v16.8h, v0.8b, v16.8b
+; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
+; CHECK-NEXT:    umull2 v16.8h, v0.16b, v16.16b
+; CHECK-NEXT:    umull2 v17.8h, v0.16b, v17.16b
+; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
+; CHECK-NEXT:    xtn v24.8b, v24.8h
+; CHECK-NEXT:    umull v19.8h, v0.8b, v19.8b
 ; CHECK-NEXT:    umull v20.8h, v0.8b, v20.8b
 ; CHECK-NEXT:    umull v21.8h, v0.8b, v21.8b
 ; CHECK-NEXT:    umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT:    umull v17.8h, v0.8b, v17.8b
-; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT:    umull v23.8h, v0.8b, v23.8b
+; CHECK-NEXT:    bic v2.8h, #127
+; CHECK-NEXT:    bic v1.8h, #127
 ; CHECK-NEXT:    eor v4.16b, v5.16b, v4.16b
 ; CHECK-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v23.16b, v24.16b
-; CHECK-NEXT:    eor v7.16b, v26.16b, v25.16b
-; CHECK-NEXT:    eor v23.16b, v25.16b, v25.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-NEXT:    umull v2.8h, v0.8b, v19.8b
-; CHECK-NEXT:    eor v16.16b, v20.16b, v16.16b
-; CHECK-NEXT:    eor v19.16b, v21.16b, v22.16b
-; CHECK-NEXT:    eor v17.16b, v17.16b, v18.16b
+; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
+; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
+; CHECK-NEXT:    umull v7.8h, v0.8b, v24.8b
+; CHECK-NEXT:    eor v16.16b, v19.16b, v18.16b
+; CHECK-NEXT:    eor v17.16b, v20.16b, v21.16b
+; CHECK-NEXT:    eor v18.16b, v22.16b, v23.16b
+; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
 ; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v7.16b, v25.16b
-; CHECK-NEXT:    eor v6.16b, v23.16b, v25.16b
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    eor v7.16b, v16.16b, v19.16b
-; CHECK-NEXT:    eor v2.16b, v17.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v25.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v25.16b
+; CHECK-NEXT:    eor v5.16b, v16.16b, v17.16b
+; CHECK-NEXT:    eor v6.16b, v18.16b, v7.16b
+; CHECK-NEXT:    umull2 v2.8h, v0.16b, v2.16b
 ; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v1.16b, v7.16b, v2.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v25.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
+; CHECK-NEXT:    eor v1.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v5.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <16 x i8> %x to <16 x i16>
   %zexty = zext <16 x i8> %y to <16 x i16>
@@ -1963,168 +1957,162 @@ define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
 define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: clmul_v8i32_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    str d12, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset b8, -8
 ; CHECK-NEXT:    .cfi_offset b9, -16
-; CHECK-NEXT:    movi v4.4s, #2
-; CHECK-NEXT:    movi v5.4s, #1
-; CHECK-NEXT:    movi v6.4s, #4
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    movi v3.4s, #8
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    movi v20.4s, #16
-; CHECK-NEXT:    movi v21.4s, #32
-; CHECK-NEXT:    and v17.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v16.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v24.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    xtn v19.4h, v16.4s
-; CHECK-NEXT:    uzp1 v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    uzp1 v22.8h, v0.8h, v5.8h
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v6.8h
-; CHECK-NEXT:    uzp1 v18.8h, v0.8h, v18.8h
-; CHECK-NEXT:    and v25.16b, v1.16b, v21.16b
-; CHECK-NEXT:    movi v6.4s, #128
-; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v24.8h
-; CHECK-NEXT:    and v28.16b, v2.16b, v20.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT:    umull v5.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v19.4h
-; CHECK-NEXT:    xtn v19.4h, v3.4s
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -48
+; CHECK-NEXT:    movi v19.4s, #2
+; CHECK-NEXT:    movi v21.4s, #1
+; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-NEXT:    movi v17.4s, #4
+; CHECK-NEXT:    movi v20.4s, #8
+; CHECK-NEXT:    movi v5.4s, #16
+; CHECK-NEXT:    movi v4.4s, #32
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    and v3.16b, v2.16b, v19.16b
+; CHECK-NEXT:    and v6.16b, v2.16b, v21.16b
+; CHECK-NEXT:    and v7.16b, v2.16b, v17.16b
+; CHECK-NEXT:    and v16.16b, v2.16b, v20.16b
+; CHECK-NEXT:    and v18.16b, v2.16b, v5.16b
+; CHECK-NEXT:    and v22.16b, v2.16b, v4.16b
+; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
+; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
+; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    movi v3.4s, #64
+; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v6.8h
+; CHECK-NEXT:    movi v6.4s, #128
+; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v7.8h
 ; CHECK-NEXT:    movi v7.4s, #1, lsl #8
-; CHECK-NEXT:    umull2 v26.4s, v0.8h, v4.8h
-; CHECK-NEXT:    umull2 v22.4s, v0.8h, v22.8h
-; CHECK-NEXT:    umull2 v23.4s, v0.8h, v23.8h
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v18.8h
-; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v25.8h
-; CHECK-NEXT:    movi v4.4s, #2, lsl #8
-; CHECK-NEXT:    and v30.16b, v1.16b, v6.16b
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v20.4s, #16, lsl #8
-; CHECK-NEXT:    and v29.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v31.16b, v1.16b, v7.16b
+; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v16.8h
+; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v18.8h
+; CHECK-NEXT:    uzp1 v28.8h, v0.8h, v22.8h
+; CHECK-NEXT:    movi v16.4s, #8, lsl #8
+; CHECK-NEXT:    movi v18.4s, #16, lsl #8
+; CHECK-NEXT:    movi v22.4s, #2, lsl #8
+; CHECK-NEXT:    umull2 v29.4s, v0.8h, v23.8h
+; CHECK-NEXT:    and v23.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT:    eor v22.16b, v22.16b, v26.16b
-; CHECK-NEXT:    xtn v28.4h, v28.4s
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v23.16b, v23.16b, v27.16b
+; CHECK-NEXT:    and v30.16b, v2.16b, v6.16b
+; CHECK-NEXT:    and v31.16b, v2.16b, v7.16b
 ; CHECK-NEXT:    umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v29.8h
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v31.8h
-; CHECK-NEXT:    and v30.16b, v1.16b, v4.16b
-; CHECK-NEXT:    xtn v31.4h, v21.4s
-; CHECK-NEXT:    movi v21.4s, #32, lsl #8
-; CHECK-NEXT:    and v8.16b, v1.16b, v20.16b
-; CHECK-NEXT:    eor v22.16b, v22.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v18.16b
-; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT:    eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT:    eor v16.16b, v16.16b, v5.16b
-; CHECK-NEXT:    umull2 v25.4s, v0.8h, v26.8h
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v30.8h
-; CHECK-NEXT:    umull2 v29.4s, v0.8h, v29.8h
-; CHECK-NEXT:    movi v30.2d, #0000000000000000
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v23.8h
-; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT:    and v9.16b, v1.16b, v21.16b
-; CHECK-NEXT:    umull v31.4s, v0.4h, v31.4h
-; CHECK-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v6.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v7.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v18.16b
 ; CHECK-NEXT:    umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT:    eor v27.16b, v27.16b, v29.16b
-; CHECK-NEXT:    eor v24.16b, v24.16b, v25.16b
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v9.8h
+; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
+; CHECK-NEXT:    umull2 v28.4s, v0.8h, v28.8h
+; CHECK-NEXT:    uzp1 v10.8h, v0.8h, v23.8h
+; CHECK-NEXT:    movi v23.4s, #32, lsl #8
+; CHECK-NEXT:    and v8.16b, v2.16b, v16.16b
+; CHECK-NEXT:    and v9.16b, v2.16b, v18.16b
 ; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT:    movi v9.4s, #64, lsl #8
-; CHECK-NEXT:    umull2 v23.4s, v0.8h, v23.8h
+; CHECK-NEXT:    uzp1 v31.8h, v0.8h, v31.8h
+; CHECK-NEXT:    and v11.16b, v2.16b, v22.16b
+; CHECK-NEXT:    eor v24.16b, v24.16b, v29.16b
+; CHECK-NEXT:    xtn v12.4h, v19.4s
+; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
+; CHECK-NEXT:    eor v25.16b, v25.16b, v26.16b
+; CHECK-NEXT:    eor v26.16b, v27.16b, v28.16b
+; CHECK-NEXT:    uzp1 v9.8h, v0.8h, v9.8h
+; CHECK-NEXT:    and v29.16b, v2.16b, v23.16b
+; CHECK-NEXT:    umull2 v27.4s, v0.8h, v10.8h
+; CHECK-NEXT:    umull2 v28.4s, v0.8h, v30.8h
+; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v11.8h
+; CHECK-NEXT:    umull2 v31.4s, v0.8h, v31.8h
+; CHECK-NEXT:    and v11.16b, v1.16b, v17.16b
+; CHECK-NEXT:    eor v17.16b, v24.16b, v25.16b
+; CHECK-NEXT:    and v10.16b, v1.16b, v21.16b
+; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v29.8h
 ; CHECK-NEXT:    umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT:    movi v25.4s, #4, lsl #8
-; CHECK-NEXT:    eor v22.16b, v22.16b, v24.16b
-; CHECK-NEXT:    eor v19.16b, v28.16b, v31.16b
-; CHECK-NEXT:    movi v28.4s, #128, lsl #8
-; CHECK-NEXT:    eor v24.16b, v27.16b, v26.16b
-; CHECK-NEXT:    and v20.16b, v2.16b, v20.16b
-; CHECK-NEXT:    xtn v6.4h, v6.4s
+; CHECK-NEXT:    movi v21.4s, #4, lsl #8
+; CHECK-NEXT:    umull2 v9.4s, v0.8h, v9.8h
+; CHECK-NEXT:    eor v19.16b, v26.16b, v27.16b
+; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
+; CHECK-NEXT:    umull2 v24.4s, v0.8h, v30.8h
+; CHECK-NEXT:    eor v25.16b, v28.16b, v31.16b
+; CHECK-NEXT:    xtn v28.4h, v11.4s
+; CHECK-NEXT:    xtn v30.4h, v20.4s
+; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
 ; CHECK-NEXT:    umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT:    umull2 v5.4s, v0.8h, v30.8h
-; CHECK-NEXT:    and v29.16b, v1.16b, v9.16b
-; CHECK-NEXT:    eor v23.16b, v23.16b, v8.16b
-; CHECK-NEXT:    and v26.16b, v1.16b, v25.16b
+; CHECK-NEXT:    xtn v10.4h, v10.4s
+; CHECK-NEXT:    and v29.16b, v2.16b, v21.16b
+; CHECK-NEXT:    eor v26.16b, v8.16b, v9.16b
+; CHECK-NEXT:    and v9.16b, v1.16b, v4.16b
+; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
+; CHECK-NEXT:    eor v20.16b, v25.16b, v24.16b
+; CHECK-NEXT:    and v25.16b, v1.16b, v5.16b
+; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
+; CHECK-NEXT:    umull v30.4s, v0.4h, v30.4h
+; CHECK-NEXT:    movi v24.4s, #64, lsl #8
 ; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v1.16b, v1.16b, v28.16b
-; CHECK-NEXT:    and v4.16b, v2.16b, v4.16b
+; CHECK-NEXT:    eor v4.16b, v26.16b, v27.16b
+; CHECK-NEXT:    and v26.16b, v1.16b, v6.16b
+; CHECK-NEXT:    xtn v27.4h, v9.4s
+; CHECK-NEXT:    xtn v25.4h, v25.4s
+; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
+; CHECK-NEXT:    xtn v16.4h, v16.4s
 ; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    and v3.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v21.16b, v2.16b, v21.16b
-; CHECK-NEXT:    eor v23.16b, v23.16b, v27.16b
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT:    eor v29.16b, v5.16b, v5.16b
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v26.8h
-; CHECK-NEXT:    uzp1 v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    xtn v4.4h, v4.4s
+; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
+; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v29.8h
+; CHECK-NEXT:    xtn v26.4h, v26.4s
+; CHECK-NEXT:    eor v28.16b, v28.16b, v30.16b
...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/184265


More information about the llvm-commits mailing list