<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/78651">78651</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] Vector load of `i4` elements leads to scalar code
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:AArch64
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
dcaballe
</td>
</tr>
</table>
<pre>
Hi there,
We have some code in IREE that tries to emulate computation on `i4` data types. In particular, this is the LLVM IR that we generate for a vector load:
```
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-linux-android34"
define <32 x float> @test(ptr noalias nonnull align 16 %ptr) {
%gep = getelementptr i8, ptr %ptr, i64 0
%ld = load <16 x i8>, ptr %gep, align 4
%bc = bitcast <16 x i8> %ld to <32 x i4>
%fp = sitofp <32 x i4> %bc to <32 x float>
ret <32 x float> %fp
}
```
When we pass it through the LLVM middle-end, we end up with:
```
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-linux-android34"
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read)
define <32 x float> @test(ptr noalias nocapture nonnull readonly align 16 %ptr) local_unnamed_addr #0 {
%ld1 = load <32 x i4>, ptr %ptr, align 16
%fp = sitofp <32 x i4> %ld1 to <32 x float>
ret <32 x float> %fp
}
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) }
```
and finally the backend generates the following when we compile it for Cortex-X3:
```
llc -mcpu=cortex-x3 test_opt.ll -o test.S
.text
.file "main_dispatch_1666"
.globl test // -- Begin function test
.p2align 4
.type test,@function
test: // @test
// %bb.0:
ldp x9, x8, [x0]
and w11, w8, #0xf
ubfx w10, w8, #4, #4
fmov s6, w11
and w11, w9, #0xf
fmov s2, w11
mov v6.b[1], w10
ubfx w10, w8, #8, #4
mov v6.b[2], w10
ubfx w10, w8, #12, #4
mov v6.b[3], w10
ubfx w10, w8, #16, #4
mov v6.b[4], w10
ubfx w10, w8, #20, #4
mov v6.b[5], w10
ubfx w10, w8, #24, #4
mov v6.b[6], w10
ubfx x10, x8, #28, #4
mov v6.b[7], w10
ubfx x10, x8, #32, #4
mov v6.b[8], w10
ubfx x10, x8, #36, #4
mov v6.b[9], w10
ubfx x10, x8, #40, #4
mov v6.b[10], w10
ubfx x10, x8, #44, #4
mov v6.b[11], w10
ubfx x10, x8, #48, #4
mov v6.b[12], w10
ubfx x10, x8, #52, #4
mov v6.b[13], w10
ubfx w10, w9, #4, #4
mov v2.b[1], w10
ubfx w10, w9, #8, #4
mov v2.b[2], w10
ubfx w10, w9, #12, #4
mov v2.b[3], w10
ubfx w10, w9, #16, #4
mov v2.b[4], w10
ubfx w10, w9, #20, #4
mov v2.b[5], w10
ubfx w10, w9, #24, #4
mov v2.b[6], w10
ubfx x10, x9, #28, #4
mov v2.b[7], w10
ubfx x10, x9, #32, #4
mov v2.b[8], w10
ubfx x10, x9, #36, #4
mov v2.b[9], w10
ubfx x10, x9, #40, #4
mov v2.b[10], w10
ubfx x10, x9, #44, #4
mov v2.b[11], w10
ubfx x10, x9, #48, #4
mov v2.b[12], w10
ubfx x10, x9, #52, #4
mov v2.b[13], w10
ubfx x10, x9, #56, #4
lsr x9, x9, #60
mov v2.b[14], w10
mov v2.b[15], w9
ubfx x9, x8, #56, #4
lsr x8, x8, #60
mov v6.b[14], w9
zip1 v0.8b, v2.8b, v0.8b
mov v6.b[15], w8
shl v0.4h, v0.4h, #12
sshr v0.4h, v0.4h, #12
sshll v0.4s, v0.4h, #0
ext v7.16b, v6.16b, v6.16b, #8
scvtf v0.4s, v0.4s
zip1 v3.8b, v6.8b, v0.8b
zip2 v1.8b, v2.8b, v0.8b
shl v3.4h, v3.4h, #12
shl v1.4h, v1.4h, #12
sshr v3.4h, v3.4h, #12
sshr v1.4h, v1.4h, #12
sshll v3.4s, v3.4h, #0
sshll v1.4s, v1.4h, #0
scvtf v4.4s, v3.4s
zip2 v3.8b, v6.8b, v0.8b
zip1 v6.8b, v7.8b, v0.8b
zip2 v7.8b, v7.8b, v0.8b
scvtf v1.4s, v1.4s
shl v3.4h, v3.4h, #12
shl v6.4h, v6.4h, #12
shl v7.4h, v7.4h, #12
sshr v3.4h, v3.4h, #12
sshr v6.4h, v6.4h, #12
sshr v7.4h, v7.4h, #12
sshll v3.4s, v3.4h, #0
sshll v6.4s, v6.4h, #0
sshll v7.4s, v7.4h, #0
scvtf v5.4s, v3.4s
ext v3.16b, v2.16b, v2.16b, #8
zip1 v2.8b, v3.8b, v0.8b
zip2 v3.8b, v3.8b, v0.8b
scvtf v6.4s, v6.4s
scvtf v7.4s, v7.4s
shl v2.4h, v2.4h, #12
shl v3.4h, v3.4h, #12
sshr v2.4h, v2.4h, #12
sshr v3.4h, v3.4h, #12
sshll v2.4s, v2.4h, #0
sshll v3.4s, v3.4h, #0
scvtf v2.4s, v2.4s
scvtf v3.4s, v3.4s
ret
.Lfunc_end0:
.size test, .Lfunc_end0-test
// -- End function
.section ".note.GNU-stack","",@progbits
```
It looks like the data is loaded with a single 64-bit pair load (`ldp`), and then the 4-bit elements are extracted from the two registers one by one in a scalar fashion.
Could this be implemented in a vector fashion? Could this code be improved in any other way?
This operation and its store counterpart, on both Neon and SVE, are really performance-critical for us.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzcWU2To7oV_TXy5hYuEBhbCy_cX8lUTd7iTTLJrkvAxVZGSJQkuu359Snx4Y-2x9CvXjavaoYGce7R1dU5MghurdgqxDVZPJDF04w3bqfNush5xqXEWaaLw_rvAtwODRL6SMInEm66478RdvwNweoKIdcFglDw5ffnZ3A77sAZgRacBqwayZ2HVHXjuBNagVZA0lAkJA2h4I6DO9Ro5_BFQc2NE3kjuSH0EdxOWBDWJwBfv37_R_Dl947-HWGLCo1nLrUBDm-YO21Aal6QeHOeKUnD_l976bjZomv7lfygGwckfgJCKQYViTcYiBWJN_5_TAMRpSTetAd_lSYk3qRJICLqARFdBSqmXds330bpRSfOiFri0AHnJt-lSaC0wkAK1ewDrgqjRREnx8juWGAplA98jCnsoZSaOxI_A0lCh9YRuqqdAaW5FNyC0ko1UgKXYqsgSoHQRe0MoQzI8qFjBN-4xbpNZosOJVaonKcRK19rfzbEPYJIE-gL5ltl0cb56vqkohT2Pix-PovcYu2vuiSSs16zvA3OhMu5dZfxPbnTx7GKxNOeossuZSucbk_PUD35WfBQqCHeoLtRRE_aV3v5dFMkvcZ3qLzUam4tCAduZ3Sz3R3lCJUoCokBqsKP_B0BVQFNDe_C7f4CKiTxA7w0Km9Nu3HOWBJvoGqsq43eGrReeqVBBKUN5o2x_sweVA5KN-pdqALehZQGXWMUVFhpcyB0xc22Qj9MMMgLQtkfE33Oa9cYPMrfk2klD7d8IHXO5WujFK-weOVF4SUbhx_8IYvoQudngrxyyNDJySQjSvXkf4pU2yN3zoiscWj7gfjZXT78P2YH7ruEqwJKobiUh9YZGc9_eB8MC3S3fJdaSv0u1Bbee1f5XwQh0RvLL-GP2jjcB_-JT8a57E3KHIIqrxsSP-UdeB-DV8arrt1cSgh0ezn_diHikM0d7t3pqhQSScgIpRUX6rUQtuYu371GaZqeOYDNt1JnkoTMkwKhL4S-QBDAA26FgnIwRqvNY0xNW2GQkCVn_R9q7Hn8z2gSDsG9T317vBm6GPTehfdtdJFl8_BsUWGyqEnI9syLcd8u4WTxsA_J4qgTxlVBQvYeRe3y1GFoHO7LI6LJyn0LCS8gyfHvACwr_UZCZtMWF0W_6IPd6GMIpR9Du_a3dJ6RxUPkE28B4Vh2q6vsLpjodKaI3qeKP0GV3qdKplPR8D7V4hNU1zN5QZX-mmrfUe2PVCNlX06nikfKvvoE1UjZ2XSqZKTsUfgJrpG6R3fkfsU1UvjojuA_ci1GKh9NUDz75SLRU9GphmZjhqZTDc1GDU2nGpqNGppONTQbNTSdamg2amg61dBs1NB0qqHZqKHpVEOzUUPTqYZmo4amkw3NRg1NJxuajRqaTjY0GzU0HTX0Fdd16aU1p4eMAXd6_vvQ1y1LXCKOSmdXybDLheqXuawucNe5pB9yOfX0U9SRR4TzVeZvvdHhpG25zXPMeHUE2J3saJJdH92ddOvPEWV3ZhpMDnT2I-40uPYJlr0t51HapZxen7Ur6ZE3f3PlR157VYt4KEF6sxY_RU09LLpfsr4i8TDU-PZQO1Q0oKK7dRsj62HjbHLIzX6kC69h0QCLbsKGoiZnbPaqWuNFjTqF9XeXd2u_vA875nSeuv1jU5MOqPQeajmgln_GBI712cPGO_3UPKcDLL0LWw6w5V05LG7KofdsfHQqvT678OwgjKPJ4rvCiO_Djtmdj9Ve3z4f45Vu6FB2ek8RE6d6jGyycOSQm_1Id2MORxQxlOGc7UaV4ptTbLB_T59_9e_0r6iKi5f0uRU_z1794QwWnF7yzzYXnlUBl7sDLQv2DYxQOlfa4fxvv_0rsI7nPwj1jwDtkXa7C7XR20w4e2fT5osDqfUPC1L8wG6DH4QFDmkSZMJB
zUW3mQ5cFf6-andxupv93rEFbhBw7wzPHRZQGl15VAVCAQebc8kNlNzuhFbzrt9H3cii29fPEERVd1RYdDH9Ln4fQ-IXOAtovzJ0UUa_tR8cuDqA9tnDOz-Q-KXr5J8erWs03ecGQlf9IMA6bRBy3SiHpuamnZNMux1oBb-hVu1wv31_JpS1ozPY7mvVaEptKq5yhNwIJ3Iu212rxs5nxTouWMz4DNfRMlykNIlWy9luHeMqSfKYhkWaUM4XYV7mSVQmZZ7nLFplM7GmIU3CKGJhSFmynCeLZYZL5IssLxNWFCQJseJCzqV8q-babGfC2gbXy1W6iGaSZyht-_GG0n7XjcSbzabd4PVqWDzNzNrHBlmztSQJpbDOnticcLL9-jPELJ7g--lDCujy9KHmOOkSedF-2ukn2E_LrDFyvXOutl78rZi3wu2abJ7ritAX32P_J6iN_i_mjtCXdjCW0Jd2PP8LAAD__-j2Bls">