[llvm] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (PR #101976)

Thu Nov 21 01:21:10 PST 2024

rj-jesus wrote:

Hi @david-arm and @paulwalker-arm, I've rebased this patch to use the version of `@llvm.experimental.vector.match` committed last week. I've also added a simple cost model which currently only lets the transformation kick in for supported SVE vectors.

Currently, a simple double find loop like the one in the description should result in something like:
```gas
// find_first_of_i8 -- generated AArch64 SVE2 code for a nested byte search.
// Registers as used below:
//   x0 = cursor into the outer range, x1 = its end bound
//   x2 = start of the inner range,    x3 = its end bound
//   (presumably [x0, x1) is the haystack and [x2, x3) the needle set; the
//   originating source is not shown here -- confirm against the PR description)
// Result in x0: x1 if either range is empty or no match is found, otherwise
// the outer cursor advanced to the matching lane (see .LBB0_8).
find_first_of_i8:                       // @find_first_of_i8
	.cfi_startproc
// %bb.0:
	// Outer range empty -> return x1.
	cmp	x0, x1
	b.eq	.LBB0_7
// %bb.1:
	// Inner range empty -> return x1.
	cmp	x2, x3
	b.eq	.LBB0_7
// %bb.2:                               // %.preheader
	// p0 = first 16 byte lanes; used to cap every chunk at 16 bytes.
	ptrue	p0.b, vl16
.LBB0_3:                                // =>This Loop Header: Depth=1
                                        //     Child Loop BB0_4 Depth 2
	// p1 = lanes i with x0 + i < x1 (unsigned), limited to 16 lanes by p0.
	whilelo	p1.b, x0, x1
	// x8 and x9 both restart at x2: x8 feeds the inner predicate, x9 is the
	// load address and loop-exit counter. They advance together by 16.
	mov	x8, x2
	mov	x9, x2
	and	p1.b, p0/z, p0.b, p1.b
	// z0 = current outer chunk; lanes outside p1 are zeroed.
	ld1b	{ z0.b }, p1/z, [x0]
.LBB0_4:                                //   Parent Loop BB0_3 Depth=1
                                        // =>  This Inner Loop Header: Depth=2
	// p2 = lanes i with x8 + i < x3 (unsigned), limited to 16 lanes by p0.
	whilelo	p2.b, x8, x3
	and	p2.b, p0/z, p0.b, p2.b
	// z1 = current inner chunk; lanes outside p2 are zeroed by the load.
	ld1b	{ z1.b }, p2/z, [x9]
	// Overwrite the lanes outside p2 with a copy of lane 0 of z1, so the
	// zero padding from the load is not itself treated as a value to search for.
	mov	z2.b, b1
	sel	z1.b, p2, z1.b, z2.b
	// Replicate the low 128 bits of z1 into every 128-bit segment, since
	// MATCH compares within 128-bit segments.
	mov	z1.q, q1
	// p2 = lanes of z0 (under p1) equal to any byte of z1; sets the flags.
	match	p2.b, p1/z, z0.b, z1.b
	// NE here means at least one lane of p2 is set, i.e. a match was found.
	b.ne	.LBB0_8
// %bb.5:                               //   in Loop: Header=BB0_4 Depth=2
	// Advance to the next 16-byte inner chunk while x9 < x3 (unsigned).
	add	x9, x9, #16
	add	x8, x8, #16
	cmp	x9, x3
	b.lo	.LBB0_4
// %bb.6:                               //   in Loop: Header=BB0_3 Depth=1
	// Inner range exhausted without a match: advance the outer cursor by 16
	// bytes and continue while x0 < x1 (unsigned).
	add	x0, x0, #16
	cmp	x0, x1
	b.lo	.LBB0_3
.LBB0_7:                                // %.loopexit1
	// No match (or an empty range): return the outer end bound.
	mov	x0, x1
	ret
.LBB0_8:                                // %.loopexit
	// Turn the match mask into an offset: brkb keeps only the lanes before
	// the first set lane of p2, and incp adds that lane count to x0.
	// NOTE(review): this exit is taken on the first inner chunk that yields
	// any match; an earlier lane of z0 that would only match a later inner
	// chunk is not considered here -- confirm this is intended.
	ptrue	p0.b
	brkb	p0.b, p0/z, p2.b
	incp	x0, p0.b
	ret
```

Could you please let me know if you have any thoughts or suggestions? Many thanks in advance!

https://github.com/llvm/llvm-project/pull/101976