<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/60147>60147</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[DAG] Odd-sized vectors getting broken up between basic blocks
</td>
</tr>
<tr>
<th>Labels</th>
<td>
llvm:codegen,
loopoptim
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
RKSimon
</td>
</tr>
</table>
<pre>
https://godbolt.org/z/sYqn3PWbx
```
// Reproducer kernel: accumulates n vectors of N doubles read from src, each
// multiplied by the scalar s, and stores the one accumulated vector to dst.
// vsrc[0] is read unconditionally (the loop starts at i = 1), so src must
// provide at least one full vector even when the loop body never runs.
template <int N>
void scaleSumVec(int n, double s, const double *src, double *dst) {
// Vector type occupying N * 8 bytes (N lanes, assuming an 8-byte double),
// declared with 8-byte alignment.
typedef double __attribute__((__vector_size__(N * 8), __aligned__(8))) vecN;
// View the scalar buffers as arrays of vecN.
const vecN *vsrc = (const vecN*)src;
vecN *vdst = (vecN*)dst;
// Seed the accumulator with the first scaled vector.
vecN t = vsrc[0] * s;
// Add the remaining scaled vectors; t is carried across loop iterations.
for (int i = 1; i < n; ++i)
t += vsrc[i] * s;
// Only the first vecN slot of dst is written.
vdst[0] = t;
}
// Entry point for the report: instantiates scaleSumVec with N = 15, the
// odd-sized vector case under discussion.
void test_scaleSumVec(int n, double x, const double *y, double *dst) {
// N = 16 instantiation, left disabled.
//scaleSumVec<16>(n, x, y, dst);
scaleSumVec<15>(n, x, y, dst);
}
```
While we handle odd-sized vectors well in straight-line code, when they cross basic block boundaries they tend to get split up into scalars/smaller vectors. In loops in particular, this leads to most of the generated code just splitting/concatenating/spilling/reloading the data.
```
define void @test_scaleSumVec(i32 noundef %n, double noundef %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %dst) {
entry:
%0 = load <15 x double>, ptr %y, align 8
%splat.splatinsert.i = insertelement <15 x double> poison, double %x, i64 0
%splat.splat.i = shufflevector <15 x double> %splat.splatinsert.i, <15 x double> poison, <15 x i32> zeroinitializer
%mul.i = fmul <15 x double> %splat.splat.i, %0
%cmp11.i = icmp sgt i32 %n, 1
br i1 %cmp11.i, label %for.body.preheader.i, label %test_scaleSumVec.exit
for.body.preheader.i: ; preds = %entry
%wide.trip.count.i = zext i32 %n to i64
br label %for.body.i
for.body.i: ; preds = %for.body.i, %for.body.preheader.i
%indvars.iv.i = phi i64 [ 1, %for.body.preheader.i ], [ %indvars.iv.next.i, %for.body.i ]
%t.012.i = phi <15 x double> [ %mul.i, %for.body.preheader.i ], [ %2, %for.body.i ]
%arrayidx1.i = getelementptr inbounds <15 x double>, ptr %y, i64 %indvars.iv.i
%1 = load <15 x double>, ptr %arrayidx1.i, align 8
%2 = tail call <15 x double> @llvm.fmuladd.v15f64(<15 x double> %1, <15 x double> %splat.splat.i, <15 x double> %t.012.i)
%indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1
%exitcond.not.i = icmp eq i64 %indvars.iv.next.i, %wide.trip.count.i
br i1 %exitcond.not.i, label %test_scaleSumVec.exit, label %for.body.i, !llvm.loop !9
test_scaleSumVec.exit: ; preds = %for.body.i, %entry
%t.0.lcssa.i = phi <15 x double> [ %mul.i, %entry ], [ %2, %for.body.i ]
store <15 x double> %t.0.lcssa.i, ptr %dst, align 8
ret void
}
declare <15 x double> @llvm.fmuladd.v15f64(<15 x double>, <15 x double>, <15 x double>) #1
```
Inner Loop:
```
.LBB0_4: # %for.body.i
vunpcklpd %xmm6, %xmm4, %xmm6 # xmm6 = xmm4[0],xmm6[0]
vunpcklpd %xmm8, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm8[0]
vunpcklpd %xmm11, %xmm7, %xmm7 # xmm7 = xmm7[0],xmm11[0]
vunpcklpd %xmm12, %xmm2, %xmm2 # xmm2 = xmm2[0],xmm12[0]
vunpcklpd %xmm13, %xmm9, %xmm8 # xmm8 = xmm9[0],xmm13[0]
vunpcklpd %xmm14, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm14[0]
vunpcklpd %xmm15, %xmm10, %xmm4 # xmm4 = xmm10[0],xmm15[0]
vmovupd -288(%r8), %xmm9
vmovsd -272(%r8), %xmm10 # xmm10 = mem[0],zero
vinsertf128 $1, %xmm10, %ymm9, %ymm10
vinsertf128 $1, %xmm5, %ymm4, %ymm4
vunpcklpd %xmm0, %xmm0, %xmm5 # xmm5 = xmm0[0,0]
vinsertf128 $1, %xmm0, %ymm5, %ymm5
vinsertf128 $1, %xmm8, %ymm3, %ymm3
vunpcklpd %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[0,0]
vunpcklpd %xmm0, %xmm0, %xmm9 # xmm9 = xmm0[0,0]
vinsertf128 $1, %xmm9, %ymm8, %ymm9
vfmadd231pd -320(%r8), %ymm9, %ymm3 # ymm3 = (ymm9 * mem) + ymm3
vinsertf128 $1, %xmm7, %ymm2, %ymm2
vunpcklpd %xmm0, %xmm0, %xmm7 # xmm7 = xmm0[0,0]
vunpcklpd %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[0,0]
vinsertf128 $1, %xmm8, %ymm7, %ymm7
vfmadd231pd -352(%r8), %ymm7, %ymm2 # ymm2 = (ymm7 * mem) + ymm2
vfmadd231pd %ymm10, %ymm5, %ymm4 # ymm4 = (ymm5 * ymm10) + ymm4
vinsertf128 $1, %xmm6, %ymm1, %ymm1
vunpcklpd %xmm0, %xmm0, %xmm6 # xmm6 = xmm0[0,0]
vunpcklpd %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[0,0]
vinsertf128 $1, %xmm8, %ymm6, %ymm6
vfmadd231pd -384(%r8), %ymm6, %ymm1 # ymm1 = (ymm6 * mem) + ymm1
vmovupd -160(%r8), %xmm8
vmovsd -144(%r8), %xmm10 # xmm10 = mem[0],zero
vinsertf128 $1, %xmm10, %ymm8, %ymm8
vfmadd231pd -256(%r8), %ymm6, %ymm1 # ymm1 = (ymm6 * mem) + ymm1
vfmadd231pd -224(%r8), %ymm7, %ymm2 # ymm2 = (ymm7 * mem) + ymm2
vfmadd231pd -192(%r8), %ymm9, %ymm3 # ymm3 = (ymm9 * mem) + ymm3
vfmadd213pd %ymm4, %ymm5, %ymm8 # ymm8 = (ymm5 * ymm8) + ymm4
vmovupd -32(%r8), %xmm4
vmovsd -16(%r8), %xmm10 # xmm10 = mem[0],zero
vinsertf128 $1, %xmm10, %ymm4, %ymm15
vfmadd231pd -64(%r8), %ymm9, %ymm3 # ymm3 = (ymm9 * mem) + ymm3
vfmadd231pd -96(%r8), %ymm7, %ymm2 # ymm2 = (ymm7 * mem) + ymm2
vfmadd213pd %ymm8, %ymm5, %ymm15 # ymm15 = (ymm5 * ymm15) + ymm8
vfmadd231pd -128(%r8), %ymm6, %ymm1 # ymm1 = (ymm6 * mem) + ymm1
vmovupd 96(%r8), %xmm4
vmovsd 112(%r8), %xmm8 # xmm8 = mem[0],zero
vinsertf128 $1, %xmm8, %ymm4, %ymm10
vfmadd231pd (%r8), %ymm6, %ymm1 # ymm1 = (ymm6 * mem) + ymm1
vextractf128 $1, %ymm1, %xmm4
vpermilpd $1, %xmm4, %xmm6 # xmm6 = xmm4[1,0]
vfmadd231pd 32(%r8), %ymm7, %ymm2 # ymm2 = (ymm7 * mem) + ymm2
vpermilpd $1, %xmm1, %xmm8 # xmm8 = xmm1[1,0]
vextractf128 $1, %ymm2, %xmm7
vpermilpd $1, %xmm7, %xmm11 # xmm11 = xmm7[1,0]
vpermilpd $1, %xmm2, %xmm12 # xmm12 = xmm2[1,0]
vfmadd231pd 64(%r8), %ymm9, %ymm3 # ymm3 = (ymm9 * mem) + ymm3
vextractf128 $1, %ymm3, %xmm9
vpermilpd $1, %xmm9, %xmm13 # xmm13 = xmm9[1,0]
vpermilpd $1, %xmm3, %xmm14 # xmm14 = xmm3[1,0]
vfmadd213pd %ymm15, %ymm5, %ymm10 # ymm10 = (ymm5 * ymm10) + ymm15
vextractf128 $1, %ymm10, %xmm5
vpermilpd $1, %xmm10, %xmm15 # xmm15 = xmm10[1,0]
addq $512, %r8 # imm = 0x200
leaq (%rdi,%rcx), %r9
addq $4, %r9
addq $4, %rcx
cmpq $1, %r9
jne .LBB0_4
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzMWt9v4yr2_2voC2pksJ04D31o2m-_Wu3V3dVeaa_2KSKGJMxg8ABuk_nrV2A7xg5107kz0o6qDjGHz_n1OQdMQ4zhB8nYA8g3IH--I409Kv3wr7__wSsl73aKnh-O1tYGpI8AvwD8clB0p4RdKH0A-OU7wC_mP99k-s8_dyeQPIPkESyT7sd_tKyqBbEMgvSJSwt_B-n_tTOvilNoSiLYH031b1YCXDgBCfATpKrZCQaNG5dKGts_AfjR6DIQAfiRGgvwGoLVpgWGEEJ7rhll-15quyXWar5rLNtuAS4ALrbbV1ZapbeGf28f_u7AYAHw2uFvt0S42FA_1z51P_CVlb-DtNM1aGzNdJMO5tXoEoL0GQJcDDMAPwK8dg7064cF1Nh-wSDqXLtW5de00k4PyDcJyJ-99WZAhhDulYZdWLkXRyDd-OETlG4I8AbgDXeeXRb58PmZAZ_H8Z3RF-3pMwyMXT2HVvtcW2bsdj7hp1jCzx-lu2VmiJw-oaUjGi48uodtYfziwInJqvyWVYNzY663v_88csHgG4NHIqlgUFF67zhGYUs4A9-YEJBLaKwm_HC0UHDJYKkoc8rejkxCe2RnWGplDNwJVX6FO9VISjRnpp2zTFJoFTwwC00tuIVNDbm0yntEtHERqYgQTF_03julQqnauEFNtOVlI4iG9sgNFIxQ4yArZSxUe6fHGwW_NKZTYrl0ZV8qWRLLJOk-m5oL0Q41E4pQLg9-OSWWLMLgTCJG2d65DrIkRo4UQ-ncZnsIcB7yJHjss1RbDaUqSW0bPcxqRqiS4uzEzu-LvWluWS83ZReTVp9d97twLU88152b0FMGnjqzPHdaLb1G30Ng0a0GODeuGy78by4N03bRVmb7gQlWMWmvcWGtuFFyVAmt63yZwSSwLlDQQZtjs98L1rIgAh23ymHPmdHP8RS7ie9MKy655UTw70wHBlWN6AzZV434UH-nGeehU2VVI9RHqqxqaA7Wab7wAvXCOw05Cpa4SUF2TLhne6UXbk9b1JodGaFMTwSmLFywE7chf6MI6SN0zbTWjJquh-ctbwYP3jhlC6t5vShVI_vUfGenwQ9Xe3yZBZ5cG86jxngTbvt3ZWgA0sY96uLgCJf0lWiz4K-dD_WRexKCfAPRDAYE-bOfzjcTHMlO9lp9u2BQbBcJwoHOCJFaZE-42w3Bc4oBzonW5MzpqeffgfVl6gqdS9-ZzcetwMdoHL_AO3RbUwmMuW4v3pt2IyZcwJKIaLlliRCv1cJVI6F08Yry_TJzh6JYZaJ4H4jWbEyqy1pwwojl3htNKIWyeYPSvEWDFZa5q7ATt6WSdCGVDVsD-xZZPaLYVS1eNY8x9g0tItplOn3Ih9vtu-7DOqzgKZjHmtbyxyU7bTZ2kSxEaQz5bLl4oM_UBzRWafZu6nsrAgb7DXbKXM0sdGfE6OGRslKQuJKbqRzl5zsP1xDgFEXPK3-Tkmn4m1L15UgwkVj8ttkk2-yGhgxwGmvt8LWRdflV1PQil5-qatll4VRV2TBcXpD8B5drL9CeyQF-8iu7T3PwxYCJgmEIj3p4NIIvboFHAegqGAbwqx5-NYJH6CZ8PICGwwAf9_h4jI9vwk8H0PUwLAL8osdfj_HTm_CDnAaq0gA_7fHTMX52E34eJDUJqDTgZ5f0JmMF-URBpV6bmsJ7_1JcAJzr_pW5j8_oVdLJGwrhPV7hqDxKIoXRPncWVawa7HGnzAl8e17dI1x0izMU8fU85O3sH3fOzK7OhxVZOBwbcAl4uyiIbjDMQ8_yPtRtpPFTEN6b3Aq8ysPhJyCKYV0aDudYFHetgLEqiLj2GdB1CLqeA531Msh5ETJhHKh9RSjFKeosu09xcs3UMYNSb9u5L0uAi7O3Ez96wvo9ZAODiN6UlNWgAIfDdwj3cRhXMNZh
Z2j3K_L-WT6uwmGX5asU5ZFmMl6L-xThIEWrSIrwO0ouvSJWbFnv_Llvna2C3Cvo1l1UZDexdRl0qHD4A_0mfjD4H0_8Mhx-UKJFFs1_GMI-PShIzzKS_2mA-w0OLSNtwFsc2-DuURYx6afub-_vbUU4nI8czpe_LHJXunA8S3-lSiO67tE63g1-RsNu9aC0LbjxUSDoB0NRnPuimDSEItIPppRL40ekiLg_UqFIKmOM-1VHqiAUKH-vVS_jHBjl5rPJuVKyjpN6RLQf3A986of9oIimH-VB-lEe3RDyQMdHVYpw5HD9c_tbLGTvcw2hODWL8Ubwl6hVxJmVzMfq4ziFqflkrNjJalJeDA7sPY_elC8bfM10xcN9NEMzb-7vvLyjyRl34nGsS1yR_dNsn7UcTbf_yAkARSyfjR8OT77juM8aE9wgIHTpbSi8QEDxk8gsbGAOwhfY0b3BO7CT_NzU8X50P5qNaHr9Gj7vc7hrpJPNIg1vNKaJnUUNzEDZBDUL7zHiRJ90XJTHW24SlHXy4RF82KDmazp8af8Me8KF7W4w-JyP7lbiJCKUfutQ88t1lm7bK68qj5CccDLphoKRb0EXpBzgJzcoTwP3dM-FQEc2nbs24yJSnsYyZVV_mwTgAvNFMi_T34OO70jv6ENK1-ma3LEHtFxlKEOrZX53fFgti7LMabZDCO_xerXfFXtc4tV6lxG6LvZ3_AEnOE0QWqMc4yxflAlZZcUOMUxznKQ5yBJWES4W_kZY6cMdN6ZhD8sEZas7fzlv_NdcMHYSIH0sFWUHJgHGPmZYKFWr2vLKPcmf7_SDE7zfNQcDskRwY80AbrkV_mszz4__D_Jn-I-rv_IfmP9zOdxp9ZVJ2NRwx-wbYxLuiOFl-3d9c9doMf2KDbfHZrcoVQXwi7e1_e--1uoLKy3AL941A_CL9-6_AQAA__-b_5dc">