<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/114891>114891</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Loop Vectorizer chooses small vectorization factor VF when known trip count isn't a multiple of it.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
MatzeB
</td>
</tr>
</table>
<pre>
I am looking into vectorization for AVX512 in a case where the loop trip count isn't a multiple of the ideal VF of 16.
A simplified version of the problem looks like this[1]:
```
void foosum56(float *dest, const float *values) {
float buf[56];
memcpy(buf, dest, sizeof(buf));
// #pragma clang loop vectorize_width(16)
for (int i = 0; i < 56; i++) {
buf[i] += values[i];
}
memcpy(dest, buf, sizeof(buf));
}
```
When compiling with something like `clang -O3 -S -o - -mavx512f test.c`, the loop vectorizer chooses a vectorization factor of 8 and the resulting code uses ymm registers:
```
...
vmovups (%rsi), %ymm0
vmovups 32(%rsi), %ymm1
vmovups 64(%rsi), %ymm2
vmovups 96(%rsi), %ymm3
vaddps (%rdi), %ymm0, %ymm0
vaddps 32(%rdi), %ymm1, %ymm1
vaddps 64(%rdi), %ymm2, %ymm2
vaddps 96(%rdi), %ymm3, %ymm3
vmovups 128(%rsi), %ymm4
vaddps 128(%rdi), %ymm4, %ymm4
vmovups 160(%rsi), %ymm5
vaddps 160(%rdi), %ymm5, %ymm5
vmovups 192(%rsi), %ymm6
vaddps 192(%rdi), %ymm6, %ymm6
vmovups %ymm0, (%rdi)
vmovups %ymm1, 32(%rdi)
vmovups %ymm2, 64(%rdi)
vmovups %ymm3, 96(%rdi)
vmovups %ymm4, 128(%rdi)
vmovups %ymm5, 160(%rdi)
vmovups %ymm6, 192(%rdi)
...
```
- Ideally, though, we would use zmm registers/operations for the first 48 elements (three 16-wide operations) and a ymm register/operation for the remaining 8 elements.
- Manually adding `#pragma clang loop vectorize_width(16)` produces a poor result too: While it does nicely use zmm for the first elements, the remaining 8 elements are scalarized...
This still reproduces on LLVM trunk (on 8b55162e195783dd27e1c69fb4d97971ef76725b from Oct 29).
Filing this to document the issue while I am trying to figure out how this situation could be improved...
[1] Simplified llvm-ir:
```
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"
define void @foosum120(ptr %dest, ptr readonly %values) #0 {
entry:
%buf = alloca [56 x float], align 16
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(224) %buf, ptr noundef nonnull align 4 dereferenceable(224) %dest, i64 224, i1 false)
br label %for.body
for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, ptr %values, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds [56 x float], ptr %buf, i64 0, i64 %indvars.iv
%1 = load float, ptr %arrayidx2, align 4
%add = fadd float %0, %1
store float %add, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 56
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ; , !llvm.loop !7
for.cond.cleanup:
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(224) %dest, ptr noundef nonnull align 16 dereferenceable(224) %buf, i64 224, i1 false)
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.width", i32 16}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJykOEtv47rOv0bdEA5sOXacRRZ9oMABzuAs5sN8d3egWHSiW1nyleSkmV9_QfmRR9POXJwicCmJFJ8iKQnv1c4gbljxxIqXB9GHvXWbbyL8xKeHrZWnzR8gWtDWvimzA2WChQPWwTr1UwRlDTTWweOPfxUZB2VAQC08wnGPDiHskSg7CE51UNveBFDeML4KIKDtdVCdRrBNxFQShYYfrzTOygVLX1j6OHwfwau206pRKOGAzhPjkaxzdqtxENGDVm_EV3lWPGWseGH5uAUr0_EXhwerJDTW-r4tSsarRlsRgPFHiT4w_gy1NT7APH0QukfP-BrY6mnYYlzc9g0rnooyMpuWoMW27k6MV7TMn2Ha1qufaJtpfk2_iYjxV8ZfgfG8c2LXCqi1MLvBgJPJ8e-jkmHPeJWVRDtyIx8wXimyL7D8BVKWP0XwGYoywow_xd-lAgCj9IoVL0DL-QuMmg6TFxqx1csH5SatRiW_Um6ivvHD8H22bac0BdhRhT1422LY0zC6k5XpYIrkrxyS75BYSCBpxeG9yHgDAX1Y1KxMz_E2m8tBvbfWowdxG7aCRhREFQgjI61D30cpaisRlIfe0-jUtuBwp3xA588BdU-dxWKMWzi09tB3ntzCeOG8igZ5BsaLU9umZw_Q34Sc8_vo2X30cnkfnd9HX5f30fMbdCFl52GSXN5K_qkSI92shLxV4lN9BspZHXmrzqeajSxnzeStZh-UnIyREcE9ayzvszjjy1v8D6QzjzK9z6P4hMeML2_xP5DOPNafxEz5CY_1J-4pP5KeY_jC8xe0d6Pswtk3kXBv0-jbG8d_sW106I23v0CPvrn13Bf40c63XvgCP9rs1qLXieBuvkvgDyp2-gRhb_vdHo4IR9trCb1H-HmVb_ir7dDFpOVjoqdM1SjnA1VUqp-osUUTfMxj4ipdXVLPxA5boQxltmqmXUyCfROmj5IJKQmFJP9fKlKZUkWWfR1zbmetG9JqgGAtyx_h__dKI6gA0qIHo2rUp1nta_0m4cjKnwkOwiH4WmhBwsiz3eP3__bKgw9Ka3A4y2UN_Pnnj28QXG_eKKStgWpbFFnJMVsXqyqXkq8wq8t1s13K9Wq9yrBZlStebKFxtoW_6gCxwF1xex1qGLUfECxIW_ck49DdeN9TX0TKx5YquFNEttCoXe8QbB9gb48DuVehH7xWx7jYIqi2c_bwQcXxO7Q78P3cJ2l9aBPlPuuAgnA7DCBFEFqciDlVf8Y5Ji3LHzHp-Cpl-WPO44eG2fWQoHIZP4maATpu-WPGq6Sp0hEycaqcyMtl8j2eSn4lCnWJGicx3qvy73KZOJR7ERKtTP-e7Ew_Ew1fiY0yCLGdY8t06OgyTie4C9QUFVOPQkOHQlqjTzR_0dHxPD13RWiCO81GozJYbPsmSiW0trWA2O7B-9D_UZPEn0FotTOQzUm3FlrPUpEnFkPTtOhS-qmY8kgkY3sjsQFjjem1njcCiQ4bdGhqFFuNjFecLwdxi7Hf-px--SX5ZBJVLiHOPoPKoBHa40XC2zrQYouaKBrrFnQbuDT9PDfbivFCGXkQzi_UIVqs26vIhRVPMHUO0cAw2o0WrsgWBt_DiDlxiMgzC-GcOCn5HhnsMIyZgIyhzJbM4UfXjCa6cPag8xXD88Zp3FFbIT_QTzzPrl5exMe0yr8S6V7UjLuP7iTR0q9kpMns10LyT6SUMtI2BEwXm2LyytyS-WAdnteFlL_P4saNw6GREkx_BOOPdxSj-nmxAb6rUFsjF8YO1KpuO8D_fKSc46QoLwJWZbe7EMpVGMeFWqMwffdhcYi2nGKSrJLFkxvrHePZ6jb6r7Y6Z4x_ePZ_7-z-09zx9dl3GKIGt5e3MefWmqru76kotBIejK1FF6jMHZ0KSEn4rMUtypSmRzlHGVXbCrcb
E_ZVERAhOLXtA_oxl1MFWT2BsY1DBGMd1r3zBPmTqaPdjspIushaRxdZ4XYtUtmLvKOIxKg_BrIflaNWmUTjTuhk6HySse3hLH9hnAoOJzRjk-BE1ymzS1pxgRBcjyOOD6J-Szpnw7DTtm8adAndnmf0asQdSmNSd_289F6VCRnlcr0Za5cg-_kZlfEncaCkNQB8hujiPAzq1h5GyNX5iFG_VwOAByTcYdBkZT1CrRiBd-8GqG1HPp3tahMG2FNQjQCfoXyGlovsAp4xZpT3ajUCXhwm84Xe4JVBdmjQqTouT5F6HRJ8Dom296FzdufQDxGn9VbUb-dQGSPjqLR2GHpnfhkkN6eDEkVkJ5UPytSURTPqLiiBDFmlOpPwrBobnhGHcT4nncXcZC_maKOzkHPqND68qDzITS7X-Vo84CZb5emqzCqeP-w3q2VWcZnzfCUEF_U6zURdLMWyrrbFtqjzB7XhKV9mWbrkeZZyvmgaIZqmWvEqLYttnbNlSr23XkTZrNs9xH52k2XLap09xBzq4yMi5waPQ7dL4hYvD24TO9Ftv_OULJQP_rxNUEHj5k_KsD8-Ptr4NqbSew83P17huEcDb8Yeza8fGFVYPPROb_YhdPENJz627VTY99tFbVvGX0mm8R8dzn9jHRh_jZrQLWxU9bDh_w0AAP__gfdDIQ">