<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/162797>162797</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [AArch64] Masked load/store generation missing for Streaming-SVE with -march=armv8-a+sme.
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          ShikharjQUIC
      </td>
    </tr>
</table>

<pre>
    For the C code given below, which has the following characteristics: 1) the streaming attribute is used to execute the function in streaming mode; 2) scalable vectorization is enabled via a pragma; 3) there is predicated control flow inside the loop body that data-depends on at least one of the arrays/vectors.
 
For ex:
__arm_locally_streaming
void foo1 (char *AA, char *BB, char *CC, char *DD, int N, int T, char *EE) {
   #pragma clang loop vectorize_width(16, scalable)
  for (int idx = 0; idx < N; idx++) {
    if (EE[idx] >= DD[idx]) {
       AA[idx] = AA[idx] + CC[idx];
 BB[idx] = BB[idx] * CC[idx];
    }
  }
}
 
When compiled with -march=armv8+sme, this code generates a disassembly that doesn't contain masked loads/stores.
 
 
For reference, a section of the generated disassembly for the above code is shown below (it contains scalar indexed loads):
.LBB0_21:
 cmphs   p8.b, p1/z, z16.b, z17.b
        mov     z19.b, p8/z, #-1
 mov     z19.b, z19.b[5]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_39
.LBB0_22:
        cmphs   p8.b, p1/z, z16.b, z17.b
 mov     z19.b, p8/z, #-1
        mov     z19.b, z19.b[6]
        fmov w30, s19
        tbnz    w30, #0, .LBB0_40
.LBB0_23:
        cmphs   p8.b, p1/z, z16.b, z17.b
        mov     z19.b, p8/z, #-1
        mov     z19.b, z19.b[7]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_41
.LBB0_24:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[8]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_42
.LBB0_25:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[9]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_43
.LBB0_26:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[10]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_44
.LBB0_27:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[11]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_45
.LBB0_28:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[12]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_46

 
Whereas if, for the same C code, we remove the streaming attribute and compile it with -march=armv8+sve, then the disassembly contains the full masked loads/stores:
 
.LBB0_15:
        ld1b    { z0.b }, p0/z, [x6, x11]
        ld1b    { z1.b }, p0/z, [x3, x11]
        cmphs   p1.b, p0/z, z0.b, z1.b
        ld1b    { z0.b }, p1/z, [x0, x11]
        ld1b    { z1.b }, p1/z, [x2, x11]
        ld1b    { z2.b }, p1/z, [x1, x11]
        ld1b { z3.b }, p1/z, [x2, x11]
        add     z0.b, z1.b, z0.b
        add z1.b, z3.b, z2.b
        st1b    { z0.b }, p1, [x0, x11]
        st1b    { z1.b }, p1, [x1, x11]
        add     x11, x11, x9
        cmp     x10, x11
        b.ne    .LBB0_15
        cmp     x10, x8
        b.ne .LBB0_3            
 
Thus, such behavior/instructions are also desired in the case of -march=armv8+sme.
 
For more details, please refer to this Godbolt reproducer: https://godbolt.org/z/5bd3zhvTP
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzUWE1vIykT_jX4UrIFtD8PPrjtWHqld0ezSnb2GNFQdjNDgwXYTvzrV3S3vzL2zmaUSyJLNF31UPVUFaRoEYJeW8QpGeRksOiIbSydnz6W-kcp_Pc___rfvFM49TpdOg-xRCjQuD2s9Q4tSJBOIex1LEGWwgsZ0esQtQxdkj0AI3wC-xI9QogeRaXtGkSMXhfbiKADbAMqiA7wBWV6lSystlZG7SxoewGrkiWeFoRHKYwohEH4hjI6rw-i0Q-AVhQGFey0gI0X60pAljBfPSotRUQFc2ejdwZWiYe2QSsE49wGEk2IpYigRBRdhRu0KoCzIKJBESI4i-BWtZPCe_EaCF_uahcCoTMgdJaihC8kmxE6e34Wvno2TgpjXp9PTAid7ZxWsHKOAeHjFDggfDabET6H4yzPL2fz-eVssUgzbSN8OT48XcofHhJlMsqTUwCEZ20opBF23ZDdtZHD571WsSR8zIZpjVDH1iDhkxq9cmnJcbKh1QuQbAGUZHk7mcOXdkJ4Xv8u7IJeJejDAxnkSWOwAJI9pBUWi9OrawQAzGaX6ovrOc9hPj9jsxqX59eIqzmf3UCkoIwW9VMztjNCZ3-Xqa5dtdGpjOrK7lbCy5JkC-Gr3ZjwPFQIa7ToRcQAApQOIgSsCvMK-1LLEpTDYAkfRSjFDqES4QcqME4owpchOo-h1xo8Vo3HFXq0EkFAwGYDuNXJjrqykrIiCrer91CFNmmjavYmGeRPqYils1FoG5qUetBW4UvrRahjUdvu_T_P6TNnTc2CrDZlAIDNuFeketgwwpeH9HRgw-bVgY16xTlhULldPR7YpMWMjxjCsy5Lqj_pNA-DfJA8Oa-1ahX3Ga2rkU0uhLGwhwsh4c3YMMgmZzK8JdP-vYfTfyFzj_eR0_AGp_cT6tMzoez3Cb0jSb_iNfqoXPXZmVr_mtom-i2mMWts7wwb3mLeijfZbzDP3s18_GHM-Zn54DMwn3wY8-zMfPgZmDP6YdT7Z-qjT0GdfRj1wZn6-FNQ5x9GPbE5dxUeQYTUEq3aRjqICkF2mw4awWOV_p_Xkhu9srDq2JaAjvcak2YBW69y2S-ceoGmvzbmZkvS5ueUMfbmhDKKFVD3TjkcaK-om6cUZnoK8yB_qdvIl7cldIVl97DZTeypFFib1hMmedFk76oQ7jrKLo3R9zl6heW_xvJ7WHYfWwOz9xkVSjWlfBmLY2yu1U6idk8lHy9UQrwftn8L2RWO3cLdpnz0PAlaeRom16lvVU6mz8KiZ9MBAqdqvY8bv4W1TSMQOql_ddk_ldsAYStLKLAUO-084UttQ_TbuiEPINI2NsGBwqA9KtBWilDfCm9dE3oXDX7lPMLWKvQhCqvS9t6Y0PT96QocSx1g7VThTASPG-_UVqJPbXoZ46benXxJ-LLV6Tm_bqpjOShUdih3T187apqpSTYRHZyy0ZDScTZkrFNO-xNkE85wUsgBk3I8XImi6A85jhUdspXs6CmnfMAoo3RMhwPWU5QxpIUS2TCjVFDSp1gJbXrG7Kpku6ND2OKUDfloMuoYUaAJ9VcEzi3uoZYSno7Tjp8mULfYrgPpU6NDDOdloo6m_vwwm3lZDvvp2vbHz6fT8SaU7kWVDiHFLx2lj8fDsvv47eHWudgVbS46W2-mb0KpY7ktetJVhC-TR-3Q3Xj3HWVM6U880kW_Jbqb8n8CAAD__9aKoRc">