<!--
  Issue metadata: one row per field, with the field name in the row-header
  cell and its value in the data cell. The border/cellspacing/cellpadding
  attributes are obsolete presentational markup; they are left in place
  because no stylesheet is visible in this fragment to take over the styling.
-->
<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th scope="row">Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/62065">62065</a>
        </td>
    </tr>

    <tr>
        <th scope="row">Summary</th>
        <td>
            [VPlan][OuterLoop] Performance investigation between inner loop and outer loop optimization
        </td>
    </tr>

    <tr>
        <th scope="row">Labels</th>
        <td>
            loopoptim,
            vectorization
        </td>
    </tr>

    <tr>
        <th scope="row">Assignees</th>
        <!-- Intentionally empty: no value is present for this field. -->
        <td>
        </td>
    </tr>

    <tr>
        <th scope="row">Reporter</th>
        <td>
            PeixinQiao
        </td>
    </tr>
</table>

<pre>
    ## Case 1: inner loop is a reduction and can be vectorized: inner loop vectorization is better
```
$ cat case.c
int A[1024], B[1024];
void foo(int iCount, int c, int jCount)
{
  int i, j;
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (i = 0; i < iCount; i++) {
    A[i] = c;
    for (j = 0; j < jCount; j++) {
      A[i] += B[j] + i;
    }
  }
}
$ cat main.c 
extern void foo(int, int, int);
int main() {
  for (int i = 0; i < 1000; ++i)
    foo(1024, 3, 1024);
  return 0;
}
$ cat run.sh 
#!/bin/bash
echo "--------------- original ---------------"
clang case.c -O3 -c
clang main.c -c
clang main.o case.o && time ./a.out
rm *.o a.out
echo "--------------- optimized ---------------"
clang case.c -O3 -c -mllvm -enable-vplan-native-path=true
clang main.c -c
clang main.o case.o && time ./a.out
rm *.o a.out

$ bash run.sh 
--------------- original ---------------
real    0m0.059s
user    0m0.059s
sys     0m0.000s
--------------- optimized ---------------
real    0m0.313s
user    0m0.313s
sys     0m0.000s
```
## Case 2: inner loop cannot be vectorized: outer loop vectorization is better
```
$ cat case.c
int arr[1024][1024];
int arr2[1024];
void foo(int iCount, int n, int jCount)
{
  int i1, i2;
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (i1 = 0; i1 < iCount; i1++) {
    arr2[i1] = i1;
    for (i2 = 0; i2 < jCount; i2++)
      arr[i2][i1] = i1 + n;
  }
}
$ # same main.c and run.sh
$ bash run.sh
--------------- original ---------------
real    0m6.005s
user    0m6.000s
sys     0m0.004s
--------------- optimized ---------------
real    0m1.046s
user    0m1.038s
sys     0m0.008s
```
Further investigation: different trip counts, and others.
</pre>
<!-- 1x1 image with empty alt text (decorative to assistive technology); presumably a mail open-tracking pixel - confirm before removing. -->
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy0VkuP4jgQ_jXmUiJyykkgBw48xGml6b3sdWWSghglNrIdZmZ__cpJgJAGaUYz04o6rnK9Hx-RzqmTJlqxdMPS3Uy2vjJ29UHqm9J_K2lmB1N-XzEUDAVspSOImViD0pos1MZcQDmwVLaFV0aD1CUUUsOB4EqFN1b9R-VE4XYhOw3l4EDek2V8x_iaZXx4ehITKKSHQjqKip6ntIc1Szcxx4SlO4Zb2IxIsenFrkaVcDSG4TJoqK1ptQ_CgSpuh_PAzgd_i0EbulsVxM53kwzFxcpTI6GopT49p0MMl6TloSaG-YP771dV-orhMrk7ATgaCyEuYGIHnIkNhOP2FmSgGW66J4dRTNAlrli66zSLe2ThajB6fhg9d0bPd6PnN0bHZnET1ENBzwMN6skNW-xuxP34OAz9aqTSUQE9k755shqeGzI04P7K705C4YM-w-Uk0lvZQmempYs576g-RTUqNgxOuwHBLYjwryfyUWaWfGt1Z_JNUrbVkavgPgsMY4b7Q4h0f5CuGrItKgMMcf78B8aqk9KyhskFQ-wV-5nqJx3mXwTMi_HFUNIXTNMrBa8Zwwy8aggihnsZmdb34rYBhuvIwIj3NtKLV01Y3J8JFeZNXV8bmPc7ML9eaqnnWnp1pflF-oqJnbct_dmUHg0LDXnu2A83pLdOsmY85w2PeJq7ntm6gFMTpvvubjzO3Rtfb0s6dSZi8dnZg_nK2SfMfIA1TrC3kFob_xmeTet_JzxLa0eI_BmbBxn8WdjWPwjbcSeHfw644xH8xFPojt_A7JCxim_4reJXAK5wZBwnEK7wbnyM333BFfblHnvoEFyP_LwG7TAxTjZ028nwQ95vz8uV-qWNyiLO0-mQZ6OBHg958osbFUc8yabO4oiL5Qtny9cbtW-tr8iC0ldyXp261QhrU6rjkSxpD96qCxShSy7qimeChotm5UqUucjljFZxtowRF7gQs2qVppSmfCn5IpWFzBZ0TIhKSpIi5yjlYqZWyFHwJI7jVCRCRCjivFxmMpeL43GBMUs4NVLVUcDdyNjTTDnX0ipDnqWzWh6odt13HWKY965gAcFxyxCftjxw093MroKl-aE9OZbwWjnvHra98nX3mfjPRy11P2dfAmr8ZcwljNsH2aOxjdQFPRcqAMhXIj3Goa5CD8wZmtmJz1pbryrvL46JNcM9w_1J-ao9RIVpGO5DRMNrfrHmTIVnuO9Sdwz3Xfb_BwAA__9oHv1w">