[llvm] [CodeLayout] Size-aware machine block placement (PR #109711)

Mon Oct 14 16:00:30 PDT 2024

nocchijiang wrote:

> Can you elaborate on this? I don't see a size regression in the provided example.

If there are multiple traps in a single function, they will be laid out at the end of it, which looks like this:

```assembly
...
ret
brk #1
brk #1
brk #1
brk #1
brk #1
brk #1
; end-of-function
```

With `ApplyExtTspForSize` enabled, they will no longer be "grouped" together. Since compression algorithms love similar inputs, I *guess* the compression ratio would take a hit, though I admit that this is only a wild guess. More concretely, yesterday I noticed a real code size regression caused by falling through to a trap (i.e. a cold block), which is a more straightforward way to illustrate a potential opportunity for improvement.

Good:
```assembly
10df62ab4:     	mov	x5, x30
10df62ab8:     	bl	0x10df7aa60
10df62abc:     	mov	x30, x5
10df62ac0:     	stp	x29, x30, [sp, #64]
10df62ac4:     	add	x29, sp, #64
10df62ac8:     	bl	0x107c73c6c
10df62acc:     	mov	x21, x0
10df62ad0:     	tbz	w2, #0, 0x10df62aec
10df62ad4:     	bl	0x107c749d4
10df62ad8:     	b.ge	0x10df62af0
10df62adc:     	bl	0x107c749d0
10df62ae0:     	b.mi	0x10df62b78
10df62ae4:     	bl	0x107c749cc
10df62ae8:     	b	0x10df62af0
10df62aec:     	mov	x8, x1
10df62af0:     	bl	0x107c749c8
10df62af4:     	cbz	x23, 0x10df62b1c
10df62af8:     	adrp	x0, 0x115984000
10df62afc:     	add	x0, x0, #3328
10df62b00:     	bl	0x107c24cb0
10df62b04:     	lsl	x8, x23, #4
10df62b08:     	add	x1, x8, #32
10df62b0c:     	bl	0x107c73850
10df62b10:     	bl	0x107c749c4
10df62b14:     	bl	0x108ac85d8
10df62b18:     	b	0x10df62b28
10df62b1c:     	adrp	x23, 0x111b71000
10df62b20:     	ldr	x23, [x23, #1616]
10df62b24:     	bl	0x107c73d1c
10df62b28:     	add	x24, x23, #32
10df62b2c:     	add	x25, x20, #32
10df62b30:     	tbz	w21, #0, 0x10df62b58
10df62b34:     	add	x8, x25, x22, lsl #4
10df62b38:     	cmp	x23, x20
10df62b3c:     	ccmp	x8, x24, #0, eq
10df62b40:     	b.hi	0x10df62b50
10df62b44:     	lsl	x2, x22, #4
10df62b48:     	bl	0x107c73ae8
10df62b4c:     	bl	0x10fea94a4
10df62b50:     	str	xzr, [x20, #16]
10df62b54:     	b	0x10df62b68
10df62b58:     	adrp	x0, 0x115880000
10df62b5c:     	add	x0, x0, #512
10df62b60:     	bl	0x107c24cb0
10df62b64:     	bl	0x107739f6c
10df62b68:     	bl	0x107c749c0
10df62b6c:     	mov	x0, x23
10df62b70:     	ldp	x29, x30, [sp, #64]
10df62b74:     	b	0x107c743f0
10df62b78:     	brk	#0x1
```

Bad:
```assembly
10def43bc:     	mov	x5, x30
10def43c0:     	bl	0x10df0c250
10def43c4:     	mov	x30, x5
10def43c8:     	stp	x29, x30, [sp, #64]
10def43cc:     	add	x29, sp, #64
10def43d0:     	bl	0x107c72c58
10def43d4:     	mov	x21, x0
10def43d8:     	tbnz	w2, #0, 0x10def43e4
10def43dc:     	mov	x8, x1
10def43e0:     	b	0x10def4400
10def43e4:     	bl	0x107c739c4
10def43e8:     	b.ge	0x10def4400
10def43ec:     	bl	0x107c739c0
10def43f0:     	b.pl	0x10def43f8
10def43f4:     	brk	#0x1
10def43f8:     	bl	0x107c739bc
10def43fc:     	csel	x8, x8, x1, gt
10def4400:     	bl	0x107c739b8
10def4404:     	cbz	x23, 0x10def442c
10def4408:     	adrp	x0, 0x1158f0000
10def440c:     	add	x0, x0, #3328
10def4410:     	bl	0x107c24d74
10def4414:     	lsl	x8, x23, #4
10def4418:     	add	x1, x8, #32
10def441c:     	bl	0x107c7283c
10def4420:     	bl	0x107c739b4
10def4424:     	bl	0x108aa160c
10def4428:     	b	0x10def4438
10def442c:     	adrp	x23, 0x111add000
10def4430:     	ldr	x23, [x23, #1616]
10def4434:     	bl	0x107c72d04
10def4438:     	add	x24, x23, #32
10def443c:     	add	x25, x20, #32
10def4440:     	tbz	w21, #0, 0x10def4468
10def4444:     	add	x8, x25, x22, lsl #4
10def4448:     	cmp	x23, x20
10def444c:     	ccmp	x8, x24, #0, eq
10def4450:     	b.hi	0x10def4460
10def4454:     	lsl	x2, x22, #4
10def4458:     	bl	0x107c72adc
10def445c:     	bl	0x10fe1508c
10def4460:     	str	xzr, [x20, #16]
10def4464:     	b	0x10def4478
10def4468:     	adrp	x0, 0x1157ec000
10def446c:     	add	x0, x0, #512
10def4470:     	bl	0x107c24d74
10def4474:     	bl	0x1077245ec
10def4478:     	bl	0x107c739b0
10def447c:     	mov	x0, x23
10def4480:     	ldp	x29, x30, [sp, #64]
10def4484:     	b	0x107c733dc
```

In the "Bad" assembly you can see that at `10def43fc` an additional `csel` is emitted as a consequence of moving `brk`.

I am wondering whether it would be possible not to move cold blocks that statically have 0% probability (or at least to make that an option for Swift inputs)?

https://github.com/llvm/llvm-project/pull/109711