<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/162797">162797</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] Masked load/store generation missing for Streaming-SVE with -march=armv8-a+sme.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
ShikharjQUIC
</td>
</tr>
</table>
<pre>
Consider C code with the following characteristics: 1) a streaming attribute is used so that the function executes in streaming mode, 2) scalable vectorization is enabled via a pragma, and 3) the loop body contains predicated control flow that is data-dependent on at least one of the arrays/vectors.
For example:
__arm_locally_streaming
void foo1 (char *AA, char *BB, char *CC, char *DD, int N, int T, char *EE) {
#pragma clang loop vectorize_width(16, scalable)
for (int idx = 0; idx < N; idx++) {
if (EE[idx] >= DD[idx]) {
AA[idx] = AA[idx] + CC[idx];
BB[idx] = BB[idx] * CC[idx];
}
}
}
When compiled with -march=armv8+sme, this code generates disassembly that does not contain masked loads/stores.
For reference, a section of the generated disassembly for the above code is shown below (the generated code contains scalar indexed loads):
.LBB0_21:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[5]
fmov w30, s19
tbnz w30, #0, .LBB0_39
.LBB0_22:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[6]
fmov w30, s19
tbnz w30, #0, .LBB0_40
.LBB0_23:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[7]
fmov w30, s19
tbnz w30, #0, .LBB0_41
.LBB0_24:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[8]
fmov w30, s19
tbnz w30, #0, .LBB0_42
.LBB0_25:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[9]
fmov w30, s19
tbnz w30, #0, .LBB0_43
.LBB0_26:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[10]
fmov w30, s19
tbnz w30, #0, .LBB0_44
.LBB0_27:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[11]
fmov w30, s19
tbnz w30, #0, .LBB0_45
.LBB0_28:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[12]
fmov w30, s19
tbnz w30, #0, .LBB0_46
Whereas, if we remove the streaming attribute from the same C code and compile it with -march=armv8+sve, the disassembly contains the full masked loads/stores:
.LBB0_15:
ld1b { z0.b }, p0/z, [x6, x11]
ld1b { z1.b }, p0/z, [x3, x11]
cmphs p1.b, p0/z, z0.b, z1.b
ld1b { z0.b }, p1/z, [x0, x11]
ld1b { z1.b }, p1/z, [x2, x11]
ld1b { z2.b }, p1/z, [x1, x11]
ld1b { z3.b }, p1/z, [x2, x11]
add z0.b, z1.b, z0.b
add z1.b, z3.b, z2.b
st1b { z0.b }, p1, [x0, x11]
st1b { z1.b }, p1, [x1, x11]
add x11, x11, x9
cmp x10, x11
b.ne .LBB0_15
cmp x10, x8
b.ne .LBB0_3
Thus, the same behavior (generation of masked load/store instructions) is also desired in the case of -march=armv8+sme.
For more details, please refer to this Godbolt reproducer: https://godbolt.org/z/5bd3zhvTP
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzUWE1vIykT_jX4UrIFtD8PPrjtWHqld0ezSnb2GNFQdjNDgwXYTvzrV3S3vzL2zmaUSyJLNF31UPVUFaRoEYJeW8QpGeRksOiIbSydnz6W-kcp_Pc___rfvFM49TpdOg-xRCjQuD2s9Q4tSJBOIex1LEGWwgsZ0esQtQxdkj0AI3wC-xI9QogeRaXtGkSMXhfbiKADbAMqiA7wBWV6lSystlZG7SxoewGrkiWeFoRHKYwohEH4hjI6rw-i0Q-AVhQGFey0gI0X60pAljBfPSotRUQFc2ejdwZWiYe2QSsE49wGEk2IpYigRBRdhRu0KoCzIKJBESI4i-BWtZPCe_EaCF_uahcCoTMgdJaihC8kmxE6e34Wvno2TgpjXp9PTAid7ZxWsHKOAeHjFDggfDabET6H4yzPL2fz-eVssUgzbSN8OT48XcofHhJlMsqTUwCEZ20opBF23ZDdtZHD571WsSR8zIZpjVDH1iDhkxq9cmnJcbKh1QuQbAGUZHk7mcOXdkJ4Xv8u7IJeJejDAxnkSWOwAJI9pBUWi9OrawQAzGaX6ovrOc9hPj9jsxqX59eIqzmf3UCkoIwW9VMztjNCZ3-Xqa5dtdGpjOrK7lbCy5JkC-Gr3ZjwPFQIa7ToRcQAApQOIgSsCvMK-1LLEpTDYAkfRSjFDqES4QcqME4owpchOo-h1xo8Vo3HFXq0EkFAwGYDuNXJjrqykrIiCrer91CFNmmjavYmGeRPqYils1FoG5qUetBW4UvrRahjUdvu_T_P6TNnTc2CrDZlAIDNuFeketgwwpeH9HRgw-bVgY16xTlhULldPR7YpMWMjxjCsy5Lqj_pNA-DfJA8Oa-1ahX3Ga2rkU0uhLGwhwsh4c3YMMgmZzK8JdP-vYfTfyFzj_eR0_AGp_cT6tMzoez3Cb0jSb_iNfqoXPXZmVr_mtom-i2mMWts7wwb3mLeijfZbzDP3s18_GHM-Zn54DMwn3wY8-zMfPgZmDP6YdT7Z-qjT0GdfRj1wZn6-FNQ5x9GPbE5dxUeQYTUEq3aRjqICkF2mw4awWOV_p_Xkhu9srDq2JaAjvcak2YBW69y2S-ceoGmvzbmZkvS5ueUMfbmhDKKFVD3TjkcaK-om6cUZnoK8yB_qdvIl7cldIVl97DZTeypFFib1hMmedFk76oQ7jrKLo3R9zl6heW_xvJ7WHYfWwOz9xkVSjWlfBmLY2yu1U6idk8lHy9UQrwftn8L2RWO3cLdpnz0PAlaeRom16lvVU6mz8KiZ9MBAqdqvY8bv4W1TSMQOql_ddk_ldsAYStLKLAUO-084UttQ_TbuiEPINI2NsGBwqA9KtBWilDfCm9dE3oXDX7lPMLWKvQhCqvS9t6Y0PT96QocSx1g7VThTASPG-_UVqJPbXoZ46benXxJ-LLV6Tm_bqpjOShUdih3T187apqpSTYRHZyy0ZDScTZkrFNO-xNkE85wUsgBk3I8XImi6A85jhUdspXs6CmnfMAoo3RMhwPWU5QxpIUS2TCjVFDSp1gJbXrG7Kpku6ND2OKUDfloMuoYUaAJ9VcEzi3uoZYSno7Tjp8mULfYrgPpU6NDDOdloo6m_vwwm3lZDvvp2vbHz6fT8SaU7kWVDiHFLx2lj8fDsvv47eHWudgVbS46W2-mb0KpY7ktetJVhC-TR-3Q3Xj3HWVM6U880kW_Jbqb8n8CAAD__9aKoRc">