<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/124872>124872</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
SROA can make a mess out of a sequence of memcpys, leading to poor optimization
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
gbaraldi
</td>
</tr>
</table>
<pre>
The difference seems to be in https://github.com/JuliaLang/llvm-project/blob/2a7ed2c1aaf5c84280d947eea56daaf302eb83d1/llvm/lib/Transforms/Scalar/SROA.cpp#L3481-L3513 and comes down to whether we take the extract + trunc path or the copyload + insert path. Of the two, LLVM can further optimize the copyload + insert path much better. The only difference between the two functions here is whether I copy the whole alloca or just half of it. I thought this could have been some undef issue, but it looks like SROA thinks it is generating good code while actually making things worse.
```llvm
source_filename = "pad"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin24.3.0"
; @good (input): stores the i64 argument into an 8-byte temporary, copies it
; byte by byte into a 16-byte temporary, then copies only the first 8 bytes of
; that temporary to the sret pointer %0.
define void @good(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; 8-byte temporary holding the i64 argument.
%3 = alloca [8 x i8], align 1
store i64 %1, ptr %3, align 1
; 16-byte temporary; only bytes 0-7 are written in this function.
%4 = alloca [16 x i8], align 1
; Addresses of bytes 1-7 of the 8-byte temporary (byte 0 is %3 itself).
%5 = getelementptr inbounds i8, ptr %3, i32 1
%6 = getelementptr inbounds i8, ptr %3, i32 2
%7 = getelementptr inbounds i8, ptr %3, i32 3
%8 = getelementptr inbounds i8, ptr %3, i32 4
%9 = getelementptr inbounds i8, ptr %3, i32 5
%10 = getelementptr inbounds i8, ptr %3, i32 6
%11 = getelementptr inbounds i8, ptr %3, i32 7
; Eight 1-byte memcpys: byte i of %3 is copied to byte i of %4.
%12 = getelementptr inbounds i8, ptr %4, i32 0
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %12, ptr align 1 %3, i64 1, i1 false)
%13 = getelementptr inbounds i8, ptr %4, i32 1
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %13, ptr align 1 %5, i64 1, i1 false)
%14 = getelementptr inbounds i8, ptr %4, i32 2
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %14, ptr align 1 %6, i64 1, i1 false)
%15 = getelementptr inbounds i8, ptr %4, i32 3
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %15, ptr align 1 %7, i64 1, i1 false)
%16 = getelementptr inbounds i8, ptr %4, i32 4
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %16, ptr align 1 %8, i64 1, i1 false)
%17 = getelementptr inbounds i8, ptr %4, i32 5
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %17, ptr align 1 %9, i64 1, i1 false)
%18 = getelementptr inbounds i8, ptr %4, i32 6
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %18, ptr align 1 %10, i64 1, i1 false)
%19 = getelementptr inbounds i8, ptr %4, i32 7
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %19, ptr align 1 %11, i64 1, i1 false)
; Final copy of 8 bytes: only the half of %4 that was written above.
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %0, ptr align 1 %4, i64 8, i1 false)
ret void
}
; @bad (input): same byte-by-byte copy sequence as @good; the only difference
; is that the final memcpy copies all 16 bytes of the temporary to the sret
; pointer %0 instead of 8.
define void @bad(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; 8-byte temporary holding the i64 argument.
%3 = alloca [8 x i8], align 1
store i64 %1, ptr %3, align 1
; 16-byte temporary; only bytes 0-7 are written in this function.
%4 = alloca [16 x i8], align 1
; Addresses of bytes 1-7 of the 8-byte temporary (byte 0 is %3 itself).
%5 = getelementptr inbounds i8, ptr %3, i32 1
%6 = getelementptr inbounds i8, ptr %3, i32 2
%7 = getelementptr inbounds i8, ptr %3, i32 3
%8 = getelementptr inbounds i8, ptr %3, i32 4
%9 = getelementptr inbounds i8, ptr %3, i32 5
%10 = getelementptr inbounds i8, ptr %3, i32 6
%11 = getelementptr inbounds i8, ptr %3, i32 7
; Eight 1-byte memcpys: byte i of %3 is copied to byte i of %4.
%12 = getelementptr inbounds i8, ptr %4, i32 0
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %12, ptr align 1 %3, i64 1, i1 false)
%13 = getelementptr inbounds i8, ptr %4, i32 1
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %13, ptr align 1 %5, i64 1, i1 false)
%14 = getelementptr inbounds i8, ptr %4, i32 2
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %14, ptr align 1 %6, i64 1, i1 false)
%15 = getelementptr inbounds i8, ptr %4, i32 3
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %15, ptr align 1 %7, i64 1, i1 false)
%16 = getelementptr inbounds i8, ptr %4, i32 4
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %16, ptr align 1 %8, i64 1, i1 false)
%17 = getelementptr inbounds i8, ptr %4, i32 5
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %17, ptr align 1 %9, i64 1, i1 false)
%18 = getelementptr inbounds i8, ptr %4, i32 6
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %18, ptr align 1 %10, i64 1, i1 false)
%19 = getelementptr inbounds i8, ptr %4, i32 7
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %19, ptr align 1 %11, i64 1, i1 false)
; Final copy of the whole 16-byte temporary, including bytes 8-15 that were
; never written above.
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %0, ptr align 1 %4, i64 16, i1 false)
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
Output
```llvm
source_filename = "pad"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin24.3.0"
; @good (reported output): both allocas and all memcpys are gone. Each byte of
; %1 is extracted, the bytes are re-inserted into an i64, and that i64 is
; written to %0 with a single store.
define void @good(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; Extract phase: byte k of %1 is obtained with lshr by 8*k followed by trunc.
%.sroa.02.0.extract.trunc = trunc i64 %1 to i8
%.sroa.2.0.extract.shift = lshr i64 %1, 8
%.sroa.2.0.extract.trunc = trunc i64 %.sroa.2.0.extract.shift to i8
%.sroa.3.0.extract.shift = lshr i64 %1, 16
%.sroa.3.0.extract.trunc = trunc i64 %.sroa.3.0.extract.shift to i8
%.sroa.4.0.extract.shift = lshr i64 %1, 24
%.sroa.4.0.extract.trunc = trunc i64 %.sroa.4.0.extract.shift to i8
%.sroa.5.0.extract.shift = lshr i64 %1, 32
%.sroa.5.0.extract.trunc = trunc i64 %.sroa.5.0.extract.shift to i8
%.sroa.6.0.extract.shift = lshr i64 %1, 40
%.sroa.6.0.extract.trunc = trunc i64 %.sroa.6.0.extract.shift to i8
%.sroa.7.0.extract.shift = lshr i64 %1, 48
%.sroa.7.0.extract.trunc = trunc i64 %.sroa.7.0.extract.shift to i8
%.sroa.8.0.extract.shift = lshr i64 %1, 56
%.sroa.8.0.extract.trunc = trunc i64 %.sroa.8.0.extract.shift to i8
; Insert phase: starting from an undef i64, each byte is zero-extended,
; shifted to its position, and merged in after masking out that byte lane.
%.sroa.0.0.insert.ext = zext i8 %.sroa.02.0.extract.trunc to i64
%.sroa.0.0.insert.mask = and i64 undef, -256
%.sroa.0.0.insert.insert = or i64 %.sroa.0.0.insert.mask, %.sroa.0.0.insert.ext
%.sroa.0.1.insert.ext = zext i8 %.sroa.2.0.extract.trunc to i64
%.sroa.0.1.insert.shift = shl i64 %.sroa.0.1.insert.ext, 8
%.sroa.0.1.insert.mask = and i64 %.sroa.0.0.insert.insert, -65281
%.sroa.0.1.insert.insert = or i64 %.sroa.0.1.insert.mask, %.sroa.0.1.insert.shift
%.sroa.0.2.insert.ext = zext i8 %.sroa.3.0.extract.trunc to i64
%.sroa.0.2.insert.shift = shl i64 %.sroa.0.2.insert.ext, 16
%.sroa.0.2.insert.mask = and i64 %.sroa.0.1.insert.insert, -16711681
%.sroa.0.2.insert.insert = or i64 %.sroa.0.2.insert.mask, %.sroa.0.2.insert.shift
%.sroa.0.3.insert.ext = zext i8 %.sroa.4.0.extract.trunc to i64
%.sroa.0.3.insert.shift = shl i64 %.sroa.0.3.insert.ext, 24
%.sroa.0.3.insert.mask = and i64 %.sroa.0.2.insert.insert, -4278190081
%.sroa.0.3.insert.insert = or i64 %.sroa.0.3.insert.mask, %.sroa.0.3.insert.shift
%.sroa.0.4.insert.ext = zext i8 %.sroa.5.0.extract.trunc to i64
%.sroa.0.4.insert.shift = shl i64 %.sroa.0.4.insert.ext, 32
%.sroa.0.4.insert.mask = and i64 %.sroa.0.3.insert.insert, -1095216660481
%.sroa.0.4.insert.insert = or i64 %.sroa.0.4.insert.mask, %.sroa.0.4.insert.shift
%.sroa.0.5.insert.ext = zext i8 %.sroa.6.0.extract.trunc to i64
%.sroa.0.5.insert.shift = shl i64 %.sroa.0.5.insert.ext, 40
%.sroa.0.5.insert.mask = and i64 %.sroa.0.4.insert.insert, -280375465082881
%.sroa.0.5.insert.insert = or i64 %.sroa.0.5.insert.mask, %.sroa.0.5.insert.shift
%.sroa.0.6.insert.ext = zext i8 %.sroa.7.0.extract.trunc to i64
%.sroa.0.6.insert.shift = shl i64 %.sroa.0.6.insert.ext, 48
%.sroa.0.6.insert.mask = and i64 %.sroa.0.5.insert.insert, -71776119061217281
%.sroa.0.6.insert.insert = or i64 %.sroa.0.6.insert.mask, %.sroa.0.6.insert.shift
%.sroa.0.7.insert.ext = zext i8 %.sroa.8.0.extract.trunc to i64
%.sroa.0.7.insert.shift = shl i64 %.sroa.0.7.insert.ext, 56
%.sroa.0.7.insert.mask = and i64 %.sroa.0.6.insert.insert, 72057594037927935
%.sroa.0.7.insert.insert = or i64 %.sroa.0.7.insert.mask, %.sroa.0.7.insert.shift
; Single 8-byte store of the reassembled value to the sret pointer.
store i64 %.sroa.0.7.insert.insert, ptr %0, align 1
ret void
}
; @bad (reported output): both allocas and all memcpys are gone, but the bytes
; are not recombined. Each byte of %1 is extracted and written to %0 with its
; own i8 store, followed by an i64 undef store covering bytes 8-15.
define void @bad(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; Extract phase: byte k of %1 is obtained with lshr by 8*k followed by trunc.
%.sroa.0.0.extract.trunc = trunc i64 %1 to i8
%.sroa.0.1.extract.shift = lshr i64 %1, 8
%.sroa.0.1.extract.trunc = trunc i64 %.sroa.0.1.extract.shift to i8
%.sroa.0.2.extract.shift = lshr i64 %1, 16
%.sroa.0.2.extract.trunc = trunc i64 %.sroa.0.2.extract.shift to i8
%.sroa.0.3.extract.shift = lshr i64 %1, 24
%.sroa.0.3.extract.trunc = trunc i64 %.sroa.0.3.extract.shift to i8
%.sroa.0.4.extract.shift = lshr i64 %1, 32
%.sroa.0.4.extract.trunc = trunc i64 %.sroa.0.4.extract.shift to i8
%.sroa.0.5.extract.shift = lshr i64 %1, 40
%.sroa.0.5.extract.trunc = trunc i64 %.sroa.0.5.extract.shift to i8
%.sroa.0.6.extract.shift = lshr i64 %1, 48
%.sroa.0.6.extract.trunc = trunc i64 %.sroa.0.6.extract.shift to i8
%.sroa.0.7.extract.shift = lshr i64 %1, 56
%.sroa.0.7.extract.trunc = trunc i64 %.sroa.0.7.extract.shift to i8
; Eight separate 1-byte stores to offsets 0-7 of the sret pointer.
store i8 %.sroa.0.0.extract.trunc, ptr %0, align 1
%.sroa.2.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 1
store i8 %.sroa.0.1.extract.trunc, ptr %.sroa.2.0..sroa_idx, align 1
%.sroa.3.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 2
store i8 %.sroa.0.2.extract.trunc, ptr %.sroa.3.0..sroa_idx, align 1
%.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 3
store i8 %.sroa.0.3.extract.trunc, ptr %.sroa.4.0..sroa_idx, align 1
%.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 4
store i8 %.sroa.0.4.extract.trunc, ptr %.sroa.5.0..sroa_idx, align 1
%.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 5
store i8 %.sroa.0.5.extract.trunc, ptr %.sroa.6.0..sroa_idx, align 1
%.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 6
store i8 %.sroa.0.6.extract.trunc, ptr %.sroa.7.0..sroa_idx, align 1
%.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 7
store i8 %.sroa.0.7.extract.trunc, ptr %.sroa.8.0..sroa_idx, align 1
; Bytes 8-15 of the destination receive an undef i64.
%.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 8
store i64 undef, ptr %.sroa.9.0..sroa_idx, align 1
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
SROA Output https://godbolt.org/z/e3hs5qcfr
O1 Output https://godbolt.org/z/ExbTxx1ra
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzsm19vozoaxj-Ne2M1wn-Bi1yk0610VrMaac9ob0cGTPCpwTm2mbbz6VdA0hIgQFOtdFanUqRhwO_jnx9s89pQ4ZzaV1JuAbsD7P5G1L4wdrtPhBU6UzeJyV623wsJM5Xn0soqldBJWTroDUwkVBUsvD84QHYAPwD8sFe-qJNNakqAH_5ZayW-imoP8IPWP8vbgzV_yNQD_JBokwD8gEUoM5wiIXKWRhRHQRbTUErBeCZETgIsk4hk6CjQ_KOauO9WVC43tnQAP_yeCi1sc_Dvb7tNejgATL4SGqHbr4QhAp8K6Qtp4ZOEXjxK6AsJ5bO3IvUQ4DvobV2l8CB8AY1tr6bm8KKNyJrLqnLS-vbyBn7L4VOh0gJ-_fqff8FUVDCvbStuDl6V6pccxcOjQFmnBUyk99JuYGOpqfRL39dCWgmVa3Glhb-1Kq3cU2G0hEJrk4oG8Y_aeVgInUOTQ-U38DfoC1PvCw99oRxMTa0zgMOfEiZSVtCZUsK6ymQOna_zHCa1h8pDbcyjg1o9Sqg8wKGDjYONRvWoqv3p5F5W0grfnNkbk8HUZLKVEKmvhdYvsBRt-SZw7-CTsU5uQLBrfjzofu3tC3bO1DaVP3KlZSVKCQG5hwDjg8gAxiDYeWH30sNMeKHFi6n9qYS8LQHZmVvFKSA7Tm8VwhEgO4Sj24rg7tzvzbm-jrfqoF9rEbbk9FYcDlreZsI-qQrTDdkEXQgIdpnMVSXhT6MyCGjQtBbg6OAtrIzQSjhYmVQcfG0lrEznaGWqqtYaOis9wBFgd4jDZ6giwO4BjqHQal9BBDNp5fFWi0RLgCPEm-sAswDgL1BxCtvB-Nz0Soaaa80N1z_qqrEq-yGyzEIQ3oFgB5sipG3XsVsAdhe9VfvlVG1b1nnTdC1Oj8JfYNOkRmFYEmBGB6r91owKs7bwXnqpZSkr3-iqKmmscU3QoCpF8Fssf39sc5uaE-H7Q8mp2uj9sfQUG78_lp1iUfD-YP4ajN4fHL4G49XB9BQctMGp0Pp1NDQjeFPKMj28bA5B82vGYjc8Tp28re2k1ztJTj287X0KwVxoJwGOjzcUkfcjousRyQQiW0Kk70fE1yPSCUS-hLh-ONLzcXEVIptADJcQ1496ej78rkLkE4jREuL62YWej_KrEMMJxHgJcf0kRs_nkqsQowlEFCwxrp8s6fmUdRVjPMWI5hivqSaYqIWeKokmKoFW-raSJsEI7yfTjER8Zhl_uSzjM834TDM-04zPNOMzzfhMM_5aaUbX29bkGW2qkWph5SqAcfbxZJWXptIvJ55xEStFdirRSrVoqiyF3XeJBwk6FOG9VUntpWtPdpsy4V2rpXUi0kdYmdzKLul5UlUGn5TWVvraVrCUpbEvAEfC7ktZArJrq24J23peG33acer--632h9p_bkb9r9PEjbNGbAK8CTbHvdVNt6_aNLc7OqWG0JuGqBu_XVw_zBUq75zWrrBnCWV0XtuKyi7JvyK8FSLrGBC_HDYLMdYf-0DXMWB6zkDXMoz1J4xg6yAIvhw2CzHWn4Dg6yBocOYfX8swlh_fjHAlwoA8XMsw1p_wIVoHwfjlsFmIsf4ERLAJNt1rjE07I5B7-Ks5UNHs0G-UOD3ztKdUCvfYrQKrrGVqp7CmMbd42Jxe2PF1ShNo7HlbBuKN1IUmDNXRYvPmWjep9HazXKGHnP36Jia2XoGRS5c9aa3jDEfootqseWjGvPNmDfXxon3jWXLaPrzKPjywbzgn90rM-Ycm_EM8RIiPLcSrLMQzFuJZC8miheNJftpCsspCMrBw-EjplZizEE9YSHEYoTgIxiaSVSaSGRPJrIl00cTxQ2raRLrKRDowcfhI7JWYM5FM9cMgZhhxzgM69pGu8pHO-EhnfWSLPo4ftNM-slU-soGP3VN9ssScj3TCRxwF
JGSUsyDC0dhJtspJNuMkm3WSLzo5ThemneSrnORDJ0dPFr7GSTbhZIjCkCMUBxxhFE48ZfgqL_mMl3zWy3DRy3HWM-1luMrLcODlOCkJ13jJx16GOGAhi2lAwhiHMWEXhWetDGesDMdWnm3QX6ipt10SDHfi_69fZZylTetXqOfpwvuXqP2o2Tx8LD_JgK9ZovbDFiCG-uNVUfO4umKJ2g9bYBjqTxpBr1mi9sMWIIb6kxDsiiVqP2qBYSg_dTP4NUvUftgCw1B_0ofwmiVqP2wBYqj_BnGc06KZ8T0zpZ2t7tqjHyp7Xr2h3NubvsSCLrJMVHyJjnyMDl-kwwt0ZA0d_RgduUhHFujoGjr2MTp6kY4u0LE1dPxjdOwiHVug42vowo_R8Yt0fIEuXEMXfYwuvEgXLtBFa-jij9FFg4ztdW_sjCSeIbmQrv29Xgf13gW1nzZ3b4OG36ybLDHab0yD-PAL4AdJCsf-THMLgt03tDbqH8_J9-dnZEXXwJtsS7KYxOJGblFIIoRiyulNsU1RTJgMkiBMEoZkQgUORJRlKedZnMXoRm1xgFmAcBxgFlK64XGAwpQSnqcxy5kANJClUHrT3j9j9zfKuVpuEaZRiG-0SKR27Tf9GFfyCbZXAcaA3d_YbfspflLvXdMBlPPuTcYrr-W2tSoVFSzFo4QCltI5aGoPTQ6d_LNuP1c3Oew6joOpqF37_beBB2Nev4UXXpnqprZ6O_M3AqfP-s__PKAFdgA_HFv0c4v_GwAA__8kHJ0_">