[llvm-dev] [RFC] Memory region declaration intrinsic

Clement Courbet via llvm-dev llvm-dev at lists.llvm.org
Thu Dec 23 00:56:18 PST 2021

On Wed, Dec 22, 2021 at 8:43 PM Nuno Lopes <nunoplopes at sapo.pt> wrote:

> The non-technical question is whether this matters at all. Do we expect
> any perf benefit from improving alias analysis for this case?

I've seen a number of very hot functions in our code where alias analysis
was at fault because of this.
The most impressive issue was in a compression algorithm, and the code was:

struct Histogram {
  int total;
  int values[256];
};

Histogram DoIt(const int* image, int size) {
  Histogram histogram;
  for (int i = 0; i < size; ++i) {
    ++histogram.values[image[i]];
    ++histogram.total;
  }
  return histogram;
}

Because alias analysis is unable to tell that `histogram.total` and
`histogram.values[*]` do not alias, the total has to be incremented one by
one inside the loop.
It was easy to fix by manually moving `histogram.total` outside of the
loop. And this made compression 1% faster overall, so it does actually
matter quite a lot.
Of course one might argue that this was not optimally written, but it was
written like this, and I've seen other cases where it's not as obvious.
This is the generated code right now:

0000000000000000 <_Z4DoItPKii>:
   0:   48 89 f8                mov    %rdi,%rax
   3:   85 d2                   test   %edx,%edx
   5:   7e 4d                   jle    54 <_Z4DoItPKii+0x54>
   7:   41 89 d0                mov    %edx,%r8d
   a:   83 fa 01                cmp    $0x1,%edx
   d:   75 04                   jne    13 <_Z4DoItPKii+0x13>
   f:   31 d2                   xor    %edx,%edx
  11:   eb 2f                   jmp    42 <_Z4DoItPKii+0x42>
  13:   44 89 c7                mov    %r8d,%edi
  16:   83 e7 fe                and    $0xfffffffe,%edi
  19:   31 d2                   xor    %edx,%edx
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)                ; loop (unrolled 2 times)
  20:   48 63 0c 96             movslq (%rsi,%rdx,4),%rcx              ; image[i]
  24:   83 44 88 04 01          addl   $0x1,0x4(%rax,%rcx,4)           ; ++histogram.values[image[i]]
  29:   83 00 01                addl   $0x1,(%rax)                     ; ++histogram.total
  2c:   48 63 4c 96 04          movslq 0x4(%rsi,%rdx,4),%rcx
  31:   83 44 88 04 01          addl   $0x1,0x4(%rax,%rcx,4)
  36:   83 00 01                addl   $0x1,(%rax)
  39:   48 83 c2 02             add    $0x2,%rdx
  3d:   48 39 d7                cmp    %rdx,%rdi
  40:   75 de                   jne    20 <_Z4DoItPKii+0x20>           ; end loop
  42:   41 f6 c0 01             test   $0x1,%r8b
  46:   74 0c                   je     54 <_Z4DoItPKii+0x54>
  48:   48 63 0c 96             movslq (%rsi,%rdx,4),%rcx
  4c:   83 44 88 04 01          addl   $0x1,0x4(%rax,%rcx,4)
  51:   83 00 01                addl   $0x1,(%rax)
  54:   c3                      ret

When I let clang emit range information (note: this was with a previous
iteration of this proposal, but the results are the same), LLVM can now
hoist `histogram.total` out of the loop.

0000000000000000 <_Z4DoItPKii>:
   0:   48 89 f8                mov    %rdi,%rax
   3:   85 d2                   test   %edx,%edx
   5:   0f 8e 7d 00 00 00       jle    88 <_Z4DoItPKii+0x88>
   b:   41 89 d1                mov    %edx,%r9d
   e:   49 8d 49 ff             lea    -0x1(%r9),%rcx
  12:   45 89 c8                mov    %r9d,%r8d
  15:   41 83 e0 03             and    $0x3,%r8d
  19:   48 83 f9 03             cmp    $0x3,%rcx
  1d:   73 04                   jae    23 <_Z4DoItPKii+0x23>
  1f:   31 c9                   xor    %ecx,%ecx
  21:   eb 3d                   jmp    60 <_Z4DoItPKii+0x60>
  23:   41 83 e1 fc             and    $0xfffffffc,%r9d
  27:   31 c9                   xor    %ecx,%ecx
  29:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)                       ; loop (unrolled 4 times)
  30:   48 63 3c 8e             movslq (%rsi,%rcx,4),%rdi              ; image[i]
  34:   83 44 b8 04 01          addl   $0x1,0x4(%rax,%rdi,4)           ; ++histogram.values[image[i]]
  39:   48 63 7c 8e 04          movslq 0x4(%rsi,%rcx,4),%rdi
  3e:   83 44 b8 04 01          addl   $0x1,0x4(%rax,%rdi,4)
  43:   48 63 7c 8e 08          movslq 0x8(%rsi,%rcx,4),%rdi
  48:   83 44 b8 04 01          addl   $0x1,0x4(%rax,%rdi,4)
  4d:   48 63 7c 8e 0c          movslq 0xc(%rsi,%rcx,4),%rdi
  52:   83 44 b8 04 01          addl   $0x1,0x4(%rax,%rdi,4)
  57:   48 83 c1 04             add    $0x4,%rcx
  5b:   49 39 c9                cmp    %rcx,%r9
  5e:   75 d0                   jne    30 <_Z4DoItPKii+0x30>
  60:   44 8b 08                mov    (%rax),%r9d
  63:   4d 85 c0                test   %r8,%r8
  66:   74 1a                   je     82 <_Z4DoItPKii+0x82>
  68:   48 8d 0c 8e             lea    (%rsi,%rcx,4),%rcx
  6c:   31 f6                   xor    %esi,%esi
  6e:   66 90                   xchg   %ax,%ax
  70:   48 63 3c b1             movslq (%rcx,%rsi,4),%rdi
  74:   83 44 b8 04 01          addl   $0x1,0x4(%rax,%rdi,4)
  79:   48 83 c6 01             add    $0x1,%rsi
  7d:   49 39 f0                cmp    %rsi,%r8
  80:   75 ee                   jne    70 <_Z4DoItPKii+0x70>
  82:   41 01 d1                add    %edx,%r9d
  85:   44 89 08                mov    %r9d,(%rax)                     ; histogram.total += iter count
  88:   c3                      ret
