[llvm] [AArch64] Add @llvm.experimental.vector.match (PR #101974)

Wed Oct 30 11:52:48 PDT 2024

rj-jesus wrote:

Thanks very much for the review, I'll go through it tomorrow!

In the meantime, I've also got a version of the IR for the C snippet I sent through earlier (based on [this](https://github.com/llvm/llvm-project/pull/101976/files#diff-264ee34e53f2e411f474df9e3ddcda8c10c3dcc84b8bcb976f341b123c6eca4e) test case):
```llvm
; Searches the bytes in [%0, %1) for a byte equal to any byte in [%2, %3).
; Both ranges are walked in 16-byte chunks (clamped to the range end) using
; scalable-vector masked loads. Returns a pointer to the matching byte, or %1
; if either range is empty or nothing matches.
;
; NOTE: this is an illustrative listing rather than a module that can be fed
; to the IR parser as-is. The region between the "<<<<<" and ">>>>>" markers
; shows two ALTERNATIVE ways of computing %72 (both define %25 and %72, so
; only one of them may be kept), part of the first alternative is elided, and
; the intrinsic declarations are not shown.
define ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) {
  ; Early exit: if either range is empty, return %1 (see the phi in
  ; %.loopexit1). %4 is the implicit label of this entry block.
  %5 = icmp eq ptr %0, %1
  %6 = icmp eq ptr %2, %3
  %7 = or i1 %5, %6
  br i1 %7, label %.loopexit1, label %.preheader

; Outer loop header: one iteration per 16-byte chunk of [%0, %1).
.preheader:                                       ; preds = %4, %77
  %pa = phi ptr [ %8, %77 ], [ %0, %4 ]
  ; %8 is the start of the next chunk; %9 is true while that start is still
  ; inside the range and doubles as the outer-loop continue condition in %77.
  %8 = getelementptr i8, ptr %pa, i64 16
  %9 = icmp ult ptr %8, %1
  ; %10 = min(%pa + 16, %1): the end of the current chunk.
  %10 = select i1 %9, ptr %8, ptr %1
  %11 = ptrtoint ptr %pa to i64
  %12 = ptrtoint ptr %10 to i64
  ; %13 enables lane i iff %pa + i is below the chunk end, so at most 16
  ; lanes are active regardless of vscale.
  %13 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %11, i64 %12)
  ; %14: bytes of the current chunk of the first range; inactive lanes read
  ; as zero (passthru operand is zeroinitializer).
  %14 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %pa, i32 1, <vscale x 16 x i1> %13, <vscale x 16 x i8> zeroinitializer)
  br label %15

; Inner loop header: one iteration per 16-byte chunk of [%2, %3), restarted
; from %2 for every chunk of the first range.
15:                                               ; preds = %76, %.preheader
  %pb = phi ptr [ %2, %.preheader ], [ %16, %76 ]
  ; Same chunking scheme as above: %17 is the inner-loop continue condition
  ; (tested in %76) and %18 is the end of the current chunk.
  %16 = getelementptr i8, ptr %pb, i64 16
  %17 = icmp ult ptr %16, %3
  %18 = select i1 %17, ptr %16, ptr %3
  %19 = ptrtoint ptr %pb to i64
  %20 = ptrtoint ptr %18 to i64
  %21 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %19, i64 %20)
  %22 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %pb, i32 1, <vscale x 16 x i1> %21, <vscale x 16 x i8> zeroinitializer)
  ; Replace the inactive lanes of %22 with a copy of lane 0. Every lane of
  ; %24 then holds a byte that was really loaded from the second range, so
  ; the zero padding of the masked load cannot produce a spurious match.
  %23 = extractelement <vscale x 16 x i8> %22, i64 0
  %.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %23, i64 0
  %.splat = shufflevector <vscale x 16 x i8> %.splatinsert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
  %24 = select <vscale x 16 x i1> %21, <vscale x 16 x i8> %22, <vscale x 16 x i8> %.splat

  ; <<<<< Stock IR

  ; Alternative 1: for each of lanes 0..15 of %24, broadcast that byte,
  ; compare it against every lane of %14, and OR the comparison masks.
  %25 = extractelement <vscale x 16 x i8> %24, i64 0
  %.splatinsert3 = insertelement <vscale x 16 x i8> poison, i8 %25, i64 0
  %.splat4 = shufflevector <vscale x 16 x i8> %.splatinsert3, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
  %26 = icmp eq <vscale x 16 x i8> %14, %.splat4

  %27 = extractelement <vscale x 16 x i8> %24, i64 1
  %.splatinsert5 = insertelement <vscale x 16 x i8> poison, i8 %27, i64 0
  %.splat6 = shufflevector <vscale x 16 x i8> %.splatinsert5, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
  %28 = icmp eq <vscale x 16 x i8> %14, %.splat6
  %29 = or <vscale x 16 x i1> %26, %28

  ; the above 13 more times
  ; (elided: lanes 2..14, defining %30 through %68; %68 is the running OR
  ; consumed below)

  %69 = extractelement <vscale x 16 x i8> %24, i64 15
  %.splatinsert33 = insertelement <vscale x 16 x i8> poison, i8 %69, i64 0
  %.splat34 = shufflevector <vscale x 16 x i8> %.splatinsert33, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
  %70 = icmp eq <vscale x 16 x i8> %14, %.splat34
  %71 = or <vscale x 16 x i1> %68, %70

  ; Drop matches reported for lanes of %14 that lie outside the first range.
  %72 = and <vscale x 16 x i1> %71, %13

  ; ===== Or, with @llvm.experimental.vector.match

  ; Alternative 2: take the low 16 bytes of %24 as a fixed-length vector and
  ; let the intrinsic do the any-of comparison, with %13 as the mask operand.
  ; NOTE(review): both intrinsics are overloaded and the names here look
  ; abbreviated; the fully mangled forms would presumably be
  ; @llvm.vector.extract.v16i8.nxv16i8 and
  ; @llvm.experimental.vector.match.nxv16i8.v16i8 - confirm against LangRef.
  %25 = tail call <16 x i8> @llvm.vector.extract(<vscale x 16 x i8> %24, i64 0)
  %72 = tail call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> %14, <16 x i8> %25, <vscale x 16 x i1> %13)

  ; >>>>>

  ; Did any lane of the current chunk of the first range match?
  %73 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %72)
  br i1 %73, label %.loopexit, label %76

; Match found: the result is %pa plus the index of the first set lane of %72.
; The `i1 true` operand makes an all-false input poison, which cannot occur
; here because this block is only reached when %73 is true.
; Note that the search stops at the first pair of chunks that matches, so the
; result is the lowest matching lane for THAT chunk of the second range.
.loopexit:                                        ; preds = %15
  %74 = tail call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %72, i1 true)
  %75 = getelementptr i8, ptr %pa, i64 %74
  br label %.loopexit1

; Inner latch: continue with the next chunk of the second range, if any.
76:                                               ; preds = %15
  br i1 %17, label %15, label %77

; Outer latch: continue with the next chunk of the first range, if any.
77:                                               ; preds = %76
  br i1 %9, label %.preheader, label %.loopexit1

; Common exit: %1 for "empty input" (from %4) or "no match" (from %77),
; otherwise the address of the matching byte computed in %.loopexit.
.loopexit1:                                       ; preds = %77, %.loopexit, %4
  %78 = phi ptr [ %1, %4 ], [ %75, %.loopexit ], [ %1, %77 ]
  ret ptr %78
}
```

The matching part of the stock IR would lower to:
```asm
	mov	z2.b, b1
	cmpeq	p2.b, p0/z, z0.b, z2.b
	mov	z2.b, z1.b[1]
	cmpeq	p3.b, p0/z, z0.b, z2.b
	mov	z2.b, z1.b[2]
	sel	p2.b, p2, p2.b, p3.b
	cmpeq	p3.b, p0/z, z0.b, z2.b
	mov	z2.b, z1.b[3]
	sel	p2.b, p2, p2.b, p3.b
        ; ... about 48 instructions in total
```

Meanwhile, for the intrinsic version, we'd get a single MATCH instruction.

I can commit these two versions to #101976 if you'd like to give them a go yourself. Just let me know if that would be useful; otherwise, I'll do it in one go after going through your feedback.

Thanks again!

https://github.com/llvm/llvm-project/pull/101974