<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/66185>66185</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Simplification of arguments of `fshl` leads to unoptimized assembly
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
quic-eikansh
</td>
</tr>
</table>
<pre>
In the code at this [Godbolt link](https://godbolt.org/z/7Y7h8e9c1), clang generates four `ldrb` instructions to load `W[0]`. In comparison, GCC 12.1 generates a single `ldr` for the same test case.
The unoptimized code is caused by the simplification of the arguments of `fshl`. The LLVM IR of the test case is in this [Godbolt link](https://godbolt.org/z/P3E8Trqc7). The IR of interest is:
```
%or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)
```
Instead, if we change the first argument of `fshl` to `%or10`, the optimal assembly is generated.
### Test case
```c
#include <stdint.h>
/*
 * GET_DATA(n, b, i): assign to n the OR of four consecutive elements of b,
 * each cast to uint32_t first:
 *   b[i] << 24, b[i + 1] << 16, b[i + 2] << 8, and b[i + 3] unshifted.
 * When the elements of b are 8 bits wide this is a big-endian 32-bit read
 * starting at b[i].
 * The expansion is a brace-enclosed block, and the argument i is expanded
 * four times.
 */
#define GET_DATA(n,b,i) \
{ \
(n) = ( (uint32_t) (b)[(i)] << 24 ) \
| ( (uint32_t) (b)[(i) + 1] << 16 ) \
| ( (uint32_t) (b)[(i) + 2] << 8 ) \
| ( (uint32_t) (b)[(i) + 3] ); \
}
/*
 * Reproducer: fills W[0] from data[0] .. data[3] via GET_DATA and returns
 * W[0] rotated right by 7 bits, converted to the int return type.
 */
int test( unsigned char data[64] )
{
/* Only element 0 of W is written and read in this function. */
uint32_t W[64];
GET_DATA( W[ 0], data, 0 );
/* SHR(x, n): mask x to its low 32 bits, then shift right by n. */
#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)
/*
 * ROTR(x, n): OR of SHR(x, n) with x shifted left by (32 - n); for a
 * 32-bit unsigned x and 0 < n < 32 this is a rotate right by n.
 */
#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
return ROTR(W[0], 7);
}
```
### LLVM IR for the test case
```
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
; Loads the four bytes at %data[0] .. %data[3], combines them as
; (b0 << 24) | (b1 << 16) | (b2 << 8) | b3 into %or10, and returns the
; result of a funnel shift left by 25 whose second operand is %or10.
define i32 @test(ptr %data) {
entry:
; Byte 0, zero-extended and shifted into bits 31..24.
%0 = load i8, ptr %data
%conv = zext i8 %0 to i32
%shl = shl nuw i32 %conv, 24
; Byte 1, zero-extended and shifted into bits 23..16.
%arrayidx1 = getelementptr inbounds i8, ptr %data, i64 1
%1 = load i8, ptr %arrayidx1
%conv2 = zext i8 %1 to i32
%shl3 = shl nuw nsw i32 %conv2, 16
%or = or i32 %shl3, %shl
; Byte 2, zero-extended and shifted into bits 15..8.
%arrayidx4 = getelementptr inbounds i8, ptr %data, i64 2
%2 = load i8, ptr %arrayidx4
%conv5 = zext i8 %2 to i32
%shl6 = shl nuw nsw i32 %conv5, 8
%or7 = or i32 %or, %shl6
; Byte 3, zero-extended and left in bits 7..0; %or10 is the full 32-bit word.
%arrayidx8 = getelementptr inbounds i8, ptr %data, i64 3
%3 = load i8, ptr %arrayidx8
%conv9 = zext i8 %3 to i32
%or10 = or i32 %or7, %conv9
; Simplified form: the first operand is %conv9 rather than %or10. The other
; terms of %or10 are shifted left by at least 8, so its low 8 bits come only
; from %conv9.
%or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)
; To see the difference in the generated asm, comment out the line above and
; uncomment the line below, which passes %or10 as both the first and second operand.
;%or15 = tail call i32 @llvm.fshl.i32(i32 %or10, i32 %or10, i32 25)
ret i32 %or15
}
declare i32 @llvm.fshl.i32(i32, i32, i32)
```
### Assembly generated by armv8-a clang for the test case
```
test(unsigned char*): // @test(unsigned char*)
ldrb w8, [x0]
ldrb w9, [x0, #1]
lsl w8, w8, #24
orr w8, w8, w9, lsl #16
ldrb w9, [x0, #2]
orr w8, w8, w9, lsl #8
ldrb w9, [x0, #3]
orr w8, w8, w9
extr w0, w9, w8, #7
ret
```
### Assembly generated after making the first two arguments of `fshl` the same
```
test: // @test
ldr w8, [x0]
rev w8, w8
ror w0, w8, #7
ret
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WF1zozoS_TXyS5ddIAGGBz_4I55N1Wzt1kx2p-7TlIC2rRssZSSROPPrb0mAg4md5GZyJ-UgI7fO6dNqqQXcGLGViDMSL0i8GvHa7pSe_ahFMUZxy6XZjXJVPs6uJdgdQqFKhD1KK5TEEkTTS-LFJ1XmqrJQCXlL4hWh6c7aO0PYnNA1oett8_tE6S2h65-Erqd_THcpZkVIaAbCQsEl5AgG0YFyC0XF5Ra2KFFziwY2qtZAkqAqdU6SAKyCSvHSdX0j8SJwtEkwAbiWUKj9HdfCKEnoEj4tlxDSSdgDUxJbLAe1UdoLMXyPYNE4bwxOgAQrEsyb680OQSM3Sh7Na6nurNiLn1g2kRGmgRH7u0psRMFdnEBtfC_X29qFzrgOkgQbs6u8ww758-f__xuuv3S2Rx8cpI-yMOfDDK_F-b_sKr3RP4qpi3ND1vAIaVE7HuGHN1KToP00tzRWOoyBsBVYLiooeFWBYBRIFFTV_X7iREwEo4SmvpvGhZL3mYt6e690GHS3NCY0O8t0LY1FXnrDDTwgFDsut-iDsRHa2Evhc2ngUFqixHO5UX5qeAXcGNzn1aOLZDf_5aQ_s4Sy5gM3XdgHPhZHQyGLqi4RCFsaWwppJzvCrgZoJW6ERPh0dfN9Nb-ZE5q6NMwJXQo3CWRJyTz45SZetoTTxUdBfmhz9O-02wcj8ynl0pfQtBbSMvrd-m6a5i5H4oXLKP9t5YJN2BJoBGfCd4HmtJku30QGhC4g7FGGSUP54SS0R5L2ZH08E_NMNCPsfJr08mjVz2Mhrd-GHFEtfZko3arUUHLLSbxIog65S8NznncOwrduCGGLk_VyYt5bM34E-G2dLhtSuoSg1XJ-yX391xdC0wOhS9lGxN0CoQkEh3X716TfFWFXIJ_cP2J8-c_NEGQI28T_0E0goSmjMG7Qms9FgRptrWVHcixcdAnTE2HHyRjsyIMdq6sbXUmyF3aw5tZyvUXrY1nxR1XbdhlSHO8Jm-NYpITN3T-jYxEmhM39xd0lEWHzJBqLkDqDkKZjyWjT99X1UXpCYrW4q7Aj4FwXuyQa1_JWqgc5roSsD-OtrI_Dmms7B22JafPvzmpXSZoUcOFvg4TS6sdj7QJnE3hCfy4QqYtqf-yTmStS3vInHiyItBlqlSPumZld5a1cK-sH6JU4h02jni3Xmj-K8hD6EVu0WKGrV84BIXNVy9Kc8ckVvCSCsIcUXtBwZBgIoUMl4Vkl7ESKNCdyqOMJk94Ipb290p2Zg3BWzdczyqN3Ke97SV9RHg2Ux0Pl9Kzy5CXlseNJT4RPB8qVftKdnBGevks46yGxV4SnA-HZUDh7Ltydh54JmbZKmlPaifE_cswDcHXnf7JQexcZyLFSD-74isClOzU33TxX99h0W-WeAfxmVorNBjXKAt0ZmJt9D_M9Pvd9fMlljbZnE58tjyUWFdf4AmMLfWzOH3yHG_q8O64ez6qQPwLX-_t0zNtnojdu9s3meVK7CZ37KjP_befA5rGkt5mf8-fVE4975jvtevDLhMSLg6-f70HInhD8omDh25BMNehpfGk9oqwrCy-BKK1fAGlc80SUdbvyr6qjb1L3NxxLP8Yv9jF-vYqABzuAeAieNB1nb_o6kkb73rXMNxY17PmtkFvIld1dfK7lxr-OuLy0f-MiHjana_otWXB27t6-ejXeX5r918eqZ-zBr8_4qJyxMmMZH-EsTLIozmKaxaPdLI5YltI8ZryI0mkQFWla0CRLEgzCKCs2IzGjAWVBFrIgjaM4nmAapjwtaVkmEVKWkSjAPRfVxBcVpbcjYUyNsyQJ03hU8Rwr41_YUSrxAfyP7hgdr0Z65saM83prXFESxponFCtshbOvz15OXcrACnlpXD3uv-fqXqeMal3NBu-ehN3V-aRQe0LXjrVtxnda
_YmFJXTtfTWErr2WvwIAAP__3htENQ">