<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/82206">82206</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64][SVE] Clang cannot vectorize TSVC s161, but GCC can
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
m-saito-fj
</td>
</tr>
</table>
<pre>
Clang cannot vectorize TSVC s161 using SVE, but GCC 13.2.0 can.
Options:
`-Ofast -march=armv8.2-a+sve`
```c
#define LEN 32000
#define LEN2 256
static int ntimes = 200000;
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);
int s161()
{
for (int nl = 0; nl < ntimes/2; nl++) {
for (int i = 0; i < LEN-1; ++i) {
if (b[i] < (float)0.) {
goto L20;
}
a[i] = c[i] + d[i] * e[i];
goto L10;
L20:
c[i+1] = a[i] + d[i] * d[i];
L10:
;
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return 0;
}
```
See also (Clang vs GCC):
https://godbolt.org/z/sacK746KP
GCC result:
```asm
.L2:
lsl x5, x0, 2
ld1w z3.s, p0/z, [x27, x0, lsl 2]
ld1w z1.s, p0/z, [x26, x0, lsl 2]
add x6, x25, x5
add x7, x28, x5
ld1w z0.s, p0/z, [x6]
ld1w z2.s, p0/z, [x7]
fcmlt p1.s, p2/z, z3.s, #0.0
movprfx z2.s, p1/m, z0.s
fmla z2.s, p1/m, z1.s, z1.s
add x8, x24, x5
fcmge p1.s, p2/z, z3.s, #0.0
add x5, x23, x5
ld1w z3.s, p0/z, [x5]
st1w z2.s, p0, [x7]
ld1w z2.s, p0/z, [x8]
movprfx z0.s, p1/m, z2.s
fmla z0.s, p1/m, z1.s, z3.s
st1w z0.s, p0, [x6]
add x0, x0, x20
whilelo p0.s, w0, w19
b.any .L2
```
Loop Body IR:
```llvm
for.body4: ; preds = %for.cond1.preheader, %for.inc
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next.pre-phi, %for.inc ]
%arrayidx = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv, !dbg !21
%0 = load float, ptr %arrayidx, align 4, !dbg !21, !tbaa !22
%cmp5 = fcmp fast olt float %0, 0.000000e+00, !dbg !26
%arrayidx17 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv, !dbg !27
%1 = load float, ptr %arrayidx17, align 4, !dbg !27, !tbaa !22
br i1 %cmp5, label %L20, label %if.end, !dbg !21
if.end: ; preds = %for.body4
%arrayidx7 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv, !dbg !28
%2 = load float, ptr %arrayidx7, align 4, !dbg !28, !tbaa !22
%arrayidx11 = getelementptr inbounds [32000 x float], ptr @e, i64 0, i64 %indvars.iv, !dbg !29
%3 = load float, ptr %arrayidx11, align 4, !dbg !29, !tbaa !22
%mul = fmul fast float %3, %1, !dbg !30
%add = fadd fast float %mul, %2, !dbg !31
%arrayidx13 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !32
store float %add, ptr %arrayidx13, align 4, !dbg !33, !tbaa !22
%.pre = add nuw nsw i64 %indvars.iv, 1, !dbg !34
br label %for.inc, !dbg !35
L20: ; preds = %for.body4
%arrayidx15 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv, !dbg !36
%4 = load float, ptr %arrayidx15, align 4, !dbg !36, !tbaa !22
%mul20 = fmul fast float %1, %1, !dbg !37
%add21 = fadd fast float %mul20, %4, !dbg !38
%5 = add nuw nsw i64 %indvars.iv, 1, !dbg !39
%arrayidx24 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %5, !dbg !40
store float %add21, ptr %arrayidx24, align 4, !dbg !41, !tbaa !22
br label %for.inc, !dbg !40
for.inc: ; preds = %if.end, %L20
%indvars.iv.next.pre-phi = phi i64 [ %.pre, %if.end ], [ %5, %L20 ], !dbg !34
%exitcond.not = icmp eq i64 %indvars.iv.next.pre-phi, 31999, !dbg !42
br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4, !dbg !13, !llvm.loop !43
```
`-mllvm -debug-only=loop-vectorize` messages:
```
LV: Checking a loop in 's161' from s161.c:18:3
LV: Loop hints: force=? width=vscale x 0 interleave=0
LV: Found a loop: for.body4
LV: Not vectorizing: Found an unidentified PHI %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next.pre-phi, %for.inc ]
LV: Interleaving disabled by the pass manager
LV: Can't vectorize the instructions or CFG
LV: Can't vectorize due to memory conflicts
LV: We can vectorize this loop!
LV: Not vectorizing: Cannot prove legality.
```
According to the above messages, multiple factors seem to prevent vectorization.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzEWV-PozgS_zTOixVkzJ-Ehzx0ksncaFuzq5vT3LPBJvEe2Bw26WQ-_ck2JBAgvT27uo1aHYyrflX-uapcEKIUPwrGNiDagmi_II0-yXpTLhXhWi7z3xeppNfNriDiCDMihNTw2_dP8MwyLWv-g8F_ffu-g8qPfYB3MG00_Lzb-YGHPWTkPYD2AL24_79WmksBgnYIYrT8NSdKw2VJ6uwEgj2py_Paw0sC8FadGYhRX98M7V_WjnFAWc4Fg6-fvsIAI4QmJjDEUezuK000zyAXGgrNS6YgCPbQ6CEEgm3fVl5IoiEB0fb101cQ7e3qBqNsMKKDEbuPOtgWsEXEZup-ZcDTuZksm5uhdHJmuBKzWtqU5RXgtXVi4OlfdMeZgrdPT2Ds9p-dxMnEEl0Irs2km1lth07lsoYAr-3mF3bjzZ67610bDwAfsLsJ8Nb-JXAENAHI73jcwr1--rr0zdDh8KdA3YfnBs_EAQfR3uLc9gwnyBuCHKWW8BXfA3cOFqz2rQC5Q-9t_LoB3trwbQcvNnp5P5DmkJ0L_t0F6067Ka0BvPU7i2TWIn20aEE7oFk_7it7-HTxTlza2my1SWpzE-9MHrqcc_mFd9DSO7I0slAz3dQC9spFJ3GrTm74jTFICiXNFrryeVamNlor7cpOWlfKjPAB4MNR0lQW2pP1EeDDD4APimS_rML4l9_6wf55t4M1U02h-5XU_RFVujveK77z134KVdjvS2TWe0HmP34Qof4b_BF4ysxVyLmxgyDaXvDqrmWQbEqOlSGEP_xJ_fg9fUKp889JYudmNCPk3DEJ3xO6uYCmXIjnXcZT8quRfJ6VhYZVt0DcSXeUARwgDw11Snmu6vxyt-EDfCitlvFyiF8WZEqwNWi_p-lwPOBwirM8K4_sg17fgN0u4GAK-Em4RCPulB5zPcfz831Zj-RvHKMRdXiS436MjGkOHnVuvqNH38cxdWMO3QP-gh_ofTvxghUSVi3gmxV785OhWOoRcYUmmadKTFsqpazgVtIr_PLPcUUoinNbEnJZe6aZC0FgSyqsakZdBwRwZGYzKajvVTU7MUJZ7ULDznCRdY4BHHFBz6RWHj9b7erEIY9DQwdEPaUHONie4UZsAOIJdtFGblmd-NAo7LELcETqmlw5vVizR6ZZwUomdKVryEUqG2HWE21tJwgvXSthrRoZECJb8I2zqLsYuOKs-zQ9mi_s92wja7SQhHZdSAt6d8seLAU_ChiOgNxYp4TYG7iHnJVVZMHzrKygbYhloZ0Va9kdT7ZLRQzgLUIP8HF3UN598Vc_TxL9EEmr3lL890nyV_M0rWZpSmvI_Y4se4SQlBXmhmk5-mOee0zQ6Y1su0UnEcy2NROfyXxx2TQOzz9BffYh6tc94_h96p8wv34WoLet839-YexDC0t61oM_EFP-_MqSZysrG_ckkJsLm3m3rAvaSuQP8QLUJ4ZSp24uhuplU7QA-AHAn2I2-HlmyUeYDW6rV1rW7O4voXSK2GCW2CCYIxbgyFRz1_JTCkXzBoV6m3btkd6wl_G3nO6OoKFoNDgF7ZPHBzL6o0ntR_-vLYp71sM_EPzR_B7F7wQ_RnPh78-E_6p31FCK_Sfxj7tm4NGrftmKfi5MkokdwuFfXHejodEQzeeOO-EfdsZ145M7E853BO8Gfjho_zqRjwT_KPD7h6Y9UaeavUGfNur82qxvMRzgQ8sX3Q3cZiYyH-CIXbg27aMnpLaWuGmN2H_HwTFqHgM_SZIHwsZtRN_AoH3oGlcvKxgRTRWMZl2BGBjwu1poum2vMA25MRs86dpBjJalEYdLytLmuJSiuIJgb5SXt7erIEawZEqRI1Pj3r6tfN_N3u9OLPsPF0dIoLXPBQR45V6KrWBey9K-IfNMoPhrELwEfW37DHHiQhsrMJd1xkCw
B8EBvnGqTyDYn1VGCgYvEEEuNKsLRs5GZuDEwaRa60EL1K-nTuir1LfXx1wce3oCNoJTJjTPOaPwt398gX_b84bz9Uu3VMMs5YqkBaMwvUJ9YrAiSsGSCHJk9WAriAB4pXvvyI00F0rXTaa5FArKGu4On58r0YZBLWHJSllfYSZFXvBMq77SvxnMiBgY4sqx37W8s5zv3Mv8qpZnBgt2JAXXV-9JxL5kmaypIUJLuyKSGtU2PA2PZVNoXhUM5sSYUlAxVhrpqmZnJm4uiKO3oJuAJkFCFmzjr9A68JNghRenDYrpKiJhmqVpHPsrSuIwXuEUJ0mK19RfL_gGIxwi7CcoQglOPD9e45yEcYbDMI3y2DSbJeGFZ5NR1scFV6phmzXGKF7YVFb25w6MBXuDdhJgDKL9ot4YnWXaHBUIUcGVVncUzXVhfyd5eamzUxy6l9Pfvn8C0b5jM2X3zbDltP1BZLhNHsDr3s8myaKpi83Du0CuT03qZbIE-GCf4d3Xsqrl7yzTAB-s4wrgg13Y_wIAAP__KvjboQ">