<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/120993>120993</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [llvm] Incorrect vectorized codegen for Cortex-M55
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            backend:ARM,
            llvm:codegen
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          PiJoules
      </td>
    </tr>
</table>

<pre>
    ```
struct Functor {
  void assignCoeff(float &dst, const float &src) { dst += src; }
};

struct Kernel {
  float *dst;
  const float *src;
  Functor f;

  void assignCoeff(int i) {
    f.assignCoeff(dst[i], src[i * 4]);
 }
};

void func(Kernel &kernel, int size) {
  //#pragma clang loop vectorize(disable)
  for (int i = 0; i < size; ++i)
 kernel.assignCoeff(i);
}
```

Clang seems to produce incorrect codegen when compiling this snippet with `/usr/local/google/home/leonardchan/misc/clang-cipd-latest-2/bin/clang++ -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp --target=arm-none-eabi -O2  -c /tmp/test.cc -fno-unroll-loops`

```
_Z4funcR6Kerneli:
        .fnstart
@ %bb.0:
 .save   {r7, lr}
        push    {r7, lr}
        cmp     r1, #1
 it      lt
        poplt   {r7, pc}
.LBB0_1:
        ldrd    r0, r2, [r0]
        add.w   r3, r2, r1, lsl #4
        subs    r3, #12
 cmp     r0, r3
        bhs     .LBB0_5
@ %bb.2:
        add.w   r3, r0, r1, lsl #2
        cmp     r2, r3
        bhs     .LBB0_5
@ %bb.3:
 dls     lr, r1
        .p2align        2
.LBB0_4: @ =>This Inner Loop Header: Depth=1
        vldr    s0, [r2]
 vldr    s2, [r0]
        adds    r2, #16
        vadd.f32        s0, s0, s2
        vstmia  r0!, {s0}
        le      lr, .LBB0_4
        b .LBB0_7
.LBB0_5:
        adr     r3, .LCPI0_0
        vldrw.u32       q0, [r3]
        vadd.i32        q0, q0, r2
        dlstp.32        lr, r1
 .p2align        2
.LBB0_6:                                @ =>This Inner Loop Header: Depth=1
        vldrw.u32       q1, [q0, #256]!
 vldrw.u32       q2, [r0]
        vadd.f32        q1, q1, q2
 vstrw.32        q1, [r0], #16
        letp    lr, .LBB0_6
.LBB0_7:
 pop     {r7, pc}
        .p2align        4
@ %bb.8:
.LCPI0_0:
 .long   4294967040                      @ 0xffffff00
        .long 4294967104                      @ 0xffffff40
        .long   4294967168 @ 0xffffff80
        .long   4294967232 @ 0xffffffc0
```

It looks like it emits two separate loops that can be taken in `func` depending on whether there's potential overlap between `src` and `dst`. This check is done under `.LBB0_1`, which will continue to `.LBB0_5` if no overlap is detected. At a high level, the check is `if (dst >= src + size * 16 - 12 || src >= dst + size * 4) goto .LBB0_5`. This branch is where the vectorized instructions are emitted. Effectively, `.LBB0_5` *should* add four initial negative offsets for the first four strided elements of `src` (`src[i * 4]`, `src[(i+1) * 4]`, `src[(i+2) * 4]`, `src[(i+3) * 4]`), then `.LBB0_6` iterates through each `dst` element and each strided `src` element, adds them, and stores the results back to `dst`, but I think the offset calculation is slightly off.

Added some comments to the relevant bits:
```
.LBB0_5: 
 adr     r3, .LCPI0_0
        vldrw.u32       q0, [r3]         // load the 4 32-bit vals into q0
        vadd.i32        q0, q0, r2       // add each val against src
                                         //   q0.0 = src - 256
 //   q0.1 = src - 192
 //   q0.2 = src - 128
 //   q0.3 = src - 64
        dlstp.32        lr, r1           // lr = size
.LBB0_6: 
        vldrw.u32       q1, [q0, #256]!  // load the 4 32-bit vals from q0 into q1 + 256 and q1 += 256
 //   q1.0 = *(src)
                                         // q1.1 = *(src + 64)
                                         //   q1.2 = *(src + 128)
                                         //   q1.3 = *(src + 192)
        vldrw.u32       q2, [r0]         // load 4 32-bit values from dst
        vadd.f32        q1, q1, q2       // add each 32-bit val from src and dst
                                         //   q1.0 = *(src) + dst[0]
                                         //   q1.1 = *(src + 64) + dst[1]
                                         //   q1.2 = *(src + 128) + dst[2]
                                         //   q1.3 = *(src + 192) + dst[3]
        vstrw.32        q1, [r0], #16    // store the 4 32-bit vals into dst then dst += 16
...
.LCPI0_0:
        .long   4294967040 @ 0xffffff00 // -256
        .long   4294967104 @ 0xffffff40 // -192
        .long   4294967168                      @ 0xffffff80 // -128
        .long   4294967232                      @ 0xffffffc0 // -64
```

The offsets for the element accesses are 64 bytes apart, when I think they should instead be 16 bytes apart, since the access is `src[i * 4]` and `src` points to floats, which are 4 bytes each. This can cause the `vldrw.u32 q1, [q0, #256]!` to access bad/uninitialized memory. I can verify with

```
__attribute__((optnone, noinline))
void testfunc() {
  constexpr int kSize = 2;
  float dst[kSize] = {1.0f, 2.0f};
  float src[] = {3.0f, -1.0f, -1.0f, -1.0f,
                          4.0f, -1.0f, -1.0f, -1.0f,
                          4.0f, -1.0f, -1.0f, -1.0f,
 4.0f, -1.0f, -1.0f, -1.0f,
                          12.0f, -1.0f, -1.0f, -1.0f,
  };
  Kernel k;
  k.dst = dst;
  k.src = src;
 func(k, kSize);
}
```

that the vectorized code does indeed access the 16th element (`12.0f`) of `src` rather than the 1st element (`4.0f`). Nothing seems to stand out from the IR corresponding to the vectorized bits:

```
19: ; preds = %19, %16
  %20 = phi i32 [ 0, %16 ], [ %29, %19 ]
  %21 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %16 ], [ %30, %19 ]
  %22 = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %20, i32 %1)
  %23 = getelementptr inbounds nuw float, ptr %5, i32 %20
  %24 = shl nsw <4 x i32> %21, splat (i32 4)
  %25 = getelementptr inbounds nuw i8, ptr %7, <4 x i32> %24
  %26 = tail call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %25, i32 4, <4 x i1> %22, <4 x float> poison), !tbaa !11, !alias.scope !13
  %27 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %23, i32 4, <4 x i1> %22, <4 x float> poison), !tbaa !11, !alias.scope !16, !noalias !13
  %28 = fadd <4 x float> %26, %27
  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %28, ptr %23, i32 4, <4 x i1> %22), !tbaa !11, !alias.scope !16, !noalias !13
  %29 = add nuw i32 %20, 4
  %30 = add <4 x i32> %21, splat (i32 4)
 %31 = icmp eq i32 %29, %18
  br i1 %31, label %32, label %19, !llvm.loop !18

32:                                               ; preds = %19, %33, %2
  ret void
```

so I would suspect it's a backend issue when lowering.
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy8WV9z4yqy_zTKS5dVEvpj-yEPdjKpm3vPuXvq7D7tyxSSkMUGgQaQPbOffqsBybLjZDIzWyeVsi3o_93QPxA1hh8kY_dRsY-Kxzs62k7p-z_4_6pRMHNXqebbfVQm4T_ZGavH2sLTKGurNETrfZTsAI6KN-CFPSjWthHZtEJRCxEpG2Mj8gC1ksbCPGp0HZEtCoDG4Mg-yh4BR7M9ROvHKNnhZ4byz3r_j2nJxKx2krZDHZkfu9Sz8xLdzGR0O0u9aTiXFniwzdEAtPElCaor9jwqHtEzVFHsOWqD3I1tg8rXfjh97SjriGwmZ0j54n6hLFRu-L_ZUn9Entx_Nmh66CnUgsoDCKUGODL0yNFvGm5oJZDTxwbTE7wBjG2CkcWfD14DxpnsI7LngcVbceUqn70JziyKIUp2D84Ww1hvwCoYtGrGmgGXtdKa1RZq1bADk3DqmIRa9QMXXB7AdtyAkXwYmIUTtx2gSPI0Gh2RJ6FqKiLydFDqgB49darHL8GUpLqpOyoj8tRzU0fkyYVjVfOhWQlqmbErEpGnistpznsJq74exih7rJW27OuqLwpY9bYb-wpWvauXFa14lD12VDew6qmuuyh7pLo_buJ01cc9RZn7_sjidojIvh3iZoDVylJ9YNaTrqSSbMVoxWH1NwKwqjF9th_wkxkb1zWsWqlWo9RKiBVm0czRXAb38z9zrJM_S18mPMp2oRrxL26lsVRbZMoTiEhRVXESaGJDjwwAC0ivsaqE9smbuIfRdPAeQd0P7lunOBuRLMVJbv2ssEtZahB2KWuovaz4t_0--Zxe2i0a3TjBCZJq4sQXe53gujmT0aaJT0iWncm8LcLgisnyBbEZKwMzMRpLcHb2wavKFhxVZ3wUnYnFRRDJpcFXliSvLCG3wkZ-SGUWVDbC0wgdtCzyPRAq-EFOz2QOcB5lO3Cyssco-_QPXFjPUjINv-EO8T-MNkwjzSMbLBb0UuxRNNqFMJkSQUIi5pl3UmTOvmLUy6VgjFubkTlHTkH4XIbsaGzPqc-SL7b13iSX1SgYLOIyub2IbRhbz0EprpOo4Vwh8W8Pfzwnn5OrOJzicbb3yxyP7NJr5xc_--UJv0zVfCZshLFDfCZc5vTtZJaYqO_8_UquL3xMg4_BWZKRonTtK50K4IL8rUK4TrWXGz5dSI7G6lN8TTGLulE-gtnhOuPlHKV1yO6g_HJ7tfVMYq4DnV8su40XM5dD2DyFkgekJdt8W66TPHk7DcnX1v0ly1ryAgJ7muTfZ89fs8_603JzQbt5h5Zk5IK2Tl437GeLyOHFgOAvDFjPrQF7UmDYQDW1zOEKA7ajFmoqoWJg6QuTwCW2aAddygQaNjDZYCdXEngLtmOaRWRtYFCWScupAHVkWtABKmZPjDl2hEplAlQ2-IQwqkxicHVcd6x-AW6gUZLBKBumkWbqImUCp47XHZy4EAjyLJcjQ8wxExVIxFuQalaN4phltWVNDDsLFDp-6ECwo4dbtmNnxVGZ8BY8vIMo-xQAKaIkh5gcwEtLWEFKIFo_ROsHP-9JA4w9k-YI4g7KKjjbF5ytNJV1h0pPGDhnx4zlGuDS412upAGqfZ6cD5_altWWH5n45tbNhe8Idzs1iga106aBVo0auOQuHZIdKHKCalvDrMNrqLfl2ljoR2H5IHDWszHBeiatwQEfhU1IYEC-ZRIs8GMeK-5Th1wnJHybhnyAJrtBYztfRdN-gOm2DKsWC1ar8dABo3V3ri1XaUvXzjXohGGQkMRY5bPQQ0Xrl1BWXgZqrkYLzwha5YuLmQ8h1FTUo6CYJ8ylEfzQWfENp2O_3HZNwxowqmcIfn1AQ9w1E-xIpYWKW-M3n-VaPTcywBX_iy3svO-4wwQIRRtnRQ4ZWVXcwpEKg8cPhZwfbXiXQjGaLv5HKoAeKJax
Ox0l321q1_ahotj1OVd8K8DehCeqxXS6mE635HqaLKfJ5no6W0yX-Qc692sjhfYy8DR10cF_rut-LzmtVj18SUKSUrfZkKJ0Bewf0ZpXgUpDHCOyi8jGn7t_IiNf0hDwWY4zoMx_TpwzjNwQiKn6eYnZLYlbcinxfWhzLddlY5mJkYVc4P7wURx0KXNeKWepXiTajPm8FP1DIXidbRcFf2dxhd1-SPCb6V_IT39B_tvVsFBAfkHB28WxUHAN-D-GXReq5l5ya29FkOA6z-LSyyHfOI5fY9HwdwOSXqLPSfUqLP7bfIhFL2HnzBe2zzf4ys0bsb3ApWdhfrO9LYycQ_m2sPoszG3NVxj2E64c1S46sXHXXfgcYAvQumbGMA-fyhyqb4gS6EC19XdRi3b-DTxscriL0QZBb1oGFsNl7dPpRQak-OrS7wxrA8AYFA_d3l0umYBfg6oKy8NpmPAvlVDT0XhdUZmcd6n3TmqIZNRkWkWbiDyNMkA-hyV71iv9LYZnp-HING-_uRu3G1dOn6m1mlejZZ8_I94jGzVYqSRDtVJxKbh0F4xuO3V3mZYZG-4zl3eW7haWfR20u9F8-bvDxNidpstYf0Hrl5ybxp3XLc_1Po2TFjUS_J4uTyeWgBLP1FmgXqVv_Xh3v8j_CvZfVJKSj_EvghUul1-m55fYn2rcOeU86M8v08U7DoZkvqBwn5h3L4DdQfHq-FKrhkGjGO55DWPNVJ5Ilpa2mxepP1N47xBoby8huqZ4rMSzqPS8xp5ZyyT3bDH8v8KlvLiGNhaXohqtb6rPf4K7jjaD8mfWgMAXJi8w-KWL6dYBumwPg2aNCR2kSLd-JRbh2iIiBfGNd-g4IF6Oij0kMxFMzaLYO9qZfQtTv8HhdBYRZQ85fEVJUfbJs2UPKNfJxB_p9INMPzI8ib6hMUtua_RN11Iu8DQjZr2pU5snQhz7-MBsTN2xMxZUsrin5iU-5jzFkwEe2NBfjMBkCeqZMBdO-M57YDakb7C4MVRqlI0BOZ782nZXOFYjR7GQRJJZTu6LtRMgzek6Rhg_d784CPfux5mVL80ovmcG3yxscFdKr3Xks7jyduy8L4vwYbhYEx9cOcfHvM1IfMyHBMvfswxWT-Jnz_Ol-nSaJovRWdGguFEymqoqtRWl-J2G6_uUCk5NbGo1MDeezT6sf8wHBMPBA2d_CBXJ_gqryzAulZu58mTjPGkRWr9yApMV6p-sHcfZY9fGrtx0GG7p5y2Jy1r5QAD-S25unZvopSvY5cqbKzNLZqIfWSPI6XcgXvcDsC-z-Hm78siu0sBTT-7ehNDKvccsMnLxOG2SqQute2GJvmz8LosWfRzHB5D45j6cZVN-nYWaWZfY1w3LKHiGk8NgZjQDqy1w624uqbv7YbIBbszIPEwU6sQ0l4f4rrnPmm22pXfsPl1n-XaTbLL0rrtP0nZdtOtis23rql6ztkqTjFSU5nlSb_Pijt-ThOQpIVm6TdZpGm_SrGJJmyUsafNqm0Z5wnrKRezipPThzhlwn5Jku83uXDyNe0VPSDAxyna7P3-P3MKKCEHGKNuFl604XDze6XscXlXjwWB5c2PNWYPlVrjX_o61eITn-ZXtVSc_MOnQ9YN_b_o7FMXdqMV9Z-3geqZD6gduu7GKa9VH5MnJ9F-rQat_sdpG5Mk5ZSLyFPw63pP_BAAA___5vYeL">