[llvm] [AArch64] Add @llvm.experimental.vector.match (PR #101974)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 30 11:52:48 PDT 2024
rj-jesus wrote:
Thanks very much for the review, I'll go through it tomorrow!
In the meantime, I've also got a version of the IR for the C snippet I sent through earlier (based on [this](https://github.com/llvm/llvm-project/pull/101976/files#diff-264ee34e53f2e411f474df9e3ddcda8c10c3dcc84b8bcb976f341b123c6eca4e) test case):
```llvm
; first_byte_of(%0, %1, %2, %3)
;
; Scans the bytes in [%0, %1) (the "haystack") in 16-byte chunks, and for each
; chunk scans the bytes in [%2, %3) (the "needles") in 16-byte chunks, looking
; for a haystack byte equal to any needle byte of the current needle chunk.
; Returns %pa plus the index of the first matching lane as soon as a
; haystack/needle chunk pair produces any match; returns %1 if either range is
; empty or if no chunk pair matches.
;
; NOTE: this listing is schematic. The region between "<<<<<" and ">>>>>" holds
; two alternative ways of computing the match mask %72; both define %25 and
; %72, so only one of them should be kept when assembling the IR.
define ptr @first_byte_of(ptr %0, ptr %1, ptr %2, ptr %3) {
; Entry block (%4): bail out with %1 when either range is empty.
%5 = icmp eq ptr %0, %1
%6 = icmp eq ptr %2, %3
%7 = or i1 %5, %6
br i1 %7, label %.loopexit1, label %.preheader
; Outer loop: one iteration per 16-byte haystack chunk starting at %pa.
.preheader: ; preds = %4, %77
%pa = phi ptr [ %8, %77 ], [ %0, %4 ]
; %8 is the start of the next chunk; %9 records whether it is still below %1.
%8 = getelementptr i8, ptr %pa, i64 16
%9 = icmp ult ptr %8, %1
; %10 = min(%8, %1): end of the current chunk, clamped to the haystack end.
%10 = select i1 %9, ptr %8, ptr %1
%11 = ptrtoint ptr %pa to i64
%12 = ptrtoint ptr %10 to i64
; %13 enables the lanes whose address lies in [%pa, %10).
%13 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %11, i64 %12)
; Haystack chunk; lanes disabled in %13 are filled with zero (the passthru).
%14 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %pa, i32 1, <vscale x 16 x i1> %13, <vscale x 16 x i8> zeroinitializer)
br label %15
; Inner loop: one iteration per 16-byte needle chunk starting at %pb.
15: ; preds = %76, %.preheader
%pb = phi ptr [ %2, %.preheader ], [ %16, %76 ]
; Same chunking as the outer loop: %16 is the next chunk, %18 the clamped end.
%16 = getelementptr i8, ptr %pb, i64 16
%17 = icmp ult ptr %16, %3
%18 = select i1 %17, ptr %16, ptr %3
%19 = ptrtoint ptr %pb to i64
%20 = ptrtoint ptr %18 to i64
; %21 enables the lanes whose address lies in [%pb, %18).
%21 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %19, i64 %20)
%22 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %pb, i32 1, <vscale x 16 x i1> %21, <vscale x 16 x i8> zeroinitializer)
; Replace the disabled needle lanes with a copy of needle lane 0, so %24 only
; contains bytes that were actually loaded (presumably to keep the zero padding
; from matching zero bytes in the haystack).
%23 = extractelement <vscale x 16 x i8> %22, i64 0
%.splatinsert = insertelement <vscale x 16 x i8> poison, i8 %23, i64 0
%.splat = shufflevector <vscale x 16 x i8> %.splatinsert, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%24 = select <vscale x 16 x i1> %21, <vscale x 16 x i8> %22, <vscale x 16 x i8> %.splat
; <<<<< Alternative 1: stock IR.
; For each of the first 16 lanes of %24: splat that needle byte, compare it
; with every haystack lane, and OR the comparison into the running mask.
%25 = extractelement <vscale x 16 x i8> %24, i64 0
%.splatinsert3 = insertelement <vscale x 16 x i8> poison, i8 %25, i64 0
%.splat4 = shufflevector <vscale x 16 x i8> %.splatinsert3, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%26 = icmp eq <vscale x 16 x i8> %14, %.splat4
%27 = extractelement <vscale x 16 x i8> %24, i64 1
%.splatinsert5 = insertelement <vscale x 16 x i8> poison, i8 %27, i64 0
%.splat6 = shufflevector <vscale x 16 x i8> %.splatinsert5, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%28 = icmp eq <vscale x 16 x i8> %14, %.splat6
%29 = or <vscale x 16 x i1> %26, %28
; ... the same extract/splat/compare/or sequence repeats 13 more times, for
; lanes 2 through 14 (elided here; it produces %30 through %68).
%69 = extractelement <vscale x 16 x i8> %24, i64 15
%.splatinsert33 = insertelement <vscale x 16 x i8> poison, i8 %69, i64 0
%.splat34 = shufflevector <vscale x 16 x i8> %.splatinsert33, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
%70 = icmp eq <vscale x 16 x i8> %14, %.splat34
%71 = or <vscale x 16 x i1> %68, %70
; Discard matches reported for haystack lanes that are disabled in %13.
%72 = and <vscale x 16 x i1> %71, %13
; ===== Alternative 2: with @llvm.experimental.vector.match.
; Take the first 16 needle bytes as a fixed-width vector and let the intrinsic
; produce the mask of haystack lanes (%14, under mask %13) that match any of them.
%25 = tail call <16 x i8> @llvm.vector.extract(<vscale x 16 x i8> %24, i64 0)
%72 = tail call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8(<vscale x 16 x i8> %14, <16 x i8> %25, <vscale x 16 x i1> %13)
; >>>>> End of the two alternatives; %72 is the match mask from here on.
; Leave both loops as soon as any haystack lane matched.
%73 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %72)
br i1 %73, label %.loopexit, label %76
; Match found: %74 is the index of the first set lane of %72. Passing i1 true
; is fine here because the branch above guarantees %72 has at least one set lane.
.loopexit: ; preds = %15
%74 = tail call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %72, i1 true)
%75 = getelementptr i8, ptr %pa, i64 %74
br label %.loopexit1
; No match in this needle chunk: continue with the next one while %17 holds.
76: ; preds = %15
br i1 %17, label %15, label %77
; All needle chunks tried: continue with the next haystack chunk while %9 holds.
77: ; preds = %76
br i1 %9, label %.preheader, label %.loopexit1
; Common exit: %75 on a match, %1 for an empty range or an exhausted haystack.
.loopexit1: ; preds = %77, %.loopexit, %4
%78 = phi ptr [ %1, %4 ], [ %75, %.loopexit ], [ %1, %77 ]
ret ptr %78
}
```
The matching part of the stock IR would lower to:
```asm
; Excerpt of the AArch64 SVE code generated for the stock-IR matching sequence.
; Presumably z0 holds the haystack chunk, z1 the needle chunk and p0 the
; haystack lane mask; the message does not spell out the register assignment.
; Broadcast needle byte 0 to every lane of z2, then compare it with z0 under p0.
mov z2.b, b1
cmpeq p2.b, p0/z, z0.b, z2.b
; Same for needle byte 1; this comparison result goes to p3.
mov z2.b, z1.b[1]
cmpeq p3.b, p0/z, z0.b, z2.b
; The broadcast for the next needle byte is issued before the previous result
; is merged. The sel keeps p2 where it is set and takes p3 elsewhere, i.e. it
; accumulates p2 = p2 OR p3.
mov z2.b, z1.b[2]
sel p2.b, p2, p2.b, p3.b
cmpeq p3.b, p0/z, z0.b, z2.b
mov z2.b, z1.b[3]
sel p2.b, p2, p2.b, p3.b
; ... the pattern continues for the remaining needle bytes: about 48 instructions in total
```
Meanwhile, for the intrinsic version, we'd get the single MATCH instruction.
I can commit these two versions to #101976 if you'd like to give them a go yourself. Just let me know if that would be useful; otherwise, I'll do it in one go after going through your feedback.
Thanks again!
https://github.com/llvm/llvm-project/pull/101974
More information about the llvm-commits mailing list