<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href=https://github.com/llvm/llvm-project/issues/66185>66185</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            Simplification of arguments of `fshl` leads to unoptimized assembly

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          quic-eikansh

      </td>

    </tr>

</table>

<pre>

    In the code in this [Godbolt link](https://godbolt.org/z/7Y7h8e9c1), clang generates four `ldrb` instructions to load `W[0]`. In comparison, GCC 12.1 generates a single `ldr` for the same test case.

The reason for the unoptimized code is the simplification of the arguments of `fshl`. The LLVM IR of the test case is in this [Godbolt link](https://godbolt.org/z/P3E8Trqc7). The IR of interest is:

```

%or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)

```

Instead, if we change the first argument of `fshl` to `%or10`, the optimal assembly is generated.

### Test case

```c

#include <stdint.h>

#define GET_DATA(n,b,i)                     \

{                                           \

    (n) = ( (uint32_t) (b)[(i)] << 24 )     \

        | ( (uint32_t) (b)[(i) + 1] << 16 ) \

        | ( (uint32_t) (b)[(i) + 2] << 8 )  \

        | ( (uint32_t) (b)[(i) + 3] );      \

}

int test( unsigned char data[64] )

{

    uint32_t W[64];

    GET_DATA( W[ 0], data, 0 );

#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)

#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))

    return ROTR(W[0], 7);

}

```

### LLVM IR for the test case

```

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"

target triple = "aarch64-unknown-linux-gnu"

define i32 @test(ptr %data) {

entry:

  %0 = load i8, ptr %data

  %conv = zext i8 %0 to i32

  %shl = shl nuw i32 %conv, 24

  %arrayidx1 = getelementptr inbounds i8, ptr %data, i64 1

  %1 = load i8, ptr %arrayidx1

  %conv2 = zext i8 %1 to i32

  %shl3 = shl nuw nsw i32 %conv2, 16

  %or = or i32 %shl3, %shl

  %arrayidx4 = getelementptr inbounds i8, ptr %data, i64 2

  %2 = load i8, ptr %arrayidx4

  %conv5 = zext i8 %2 to i32

  %shl6 = shl nuw nsw i32 %conv5, 8

  %or7 = or i32 %or, %shl6

  %arrayidx8 = getelementptr inbounds i8, ptr %data, i64 3

  %3 = load i8, ptr %arrayidx8

  %conv9 = zext i8 %3 to i32

  %or10 = or i32 %or7, %conv9

  %or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)

  ; Uncomment below line and comment above line to see the difference in asm

  ;%or15 = tail call i32 @llvm.fshl.i32(i32 %or10, i32 %or10, i32 25)

  ret i32 %or15

}

declare i32 @llvm.fshl.i32(i32, i32, i32)

```

### Assembly generated by armv8-a clang for the test case

```

test(unsigned char*):                              // @test(unsigned char*)

        ldrb    w8, [x0]

        ldrb    w9, [x0, #1]

        lsl     w8, w8, #24

        orr     w8, w8, w9, lsl #16

        ldrb    w9, [x0, #2]

        orr     w8, w8, w9, lsl #8

        ldrb    w9, [x0, #3]

        orr     w8, w8, w9

        extr    w0, w9, w8, #7

        ret

```

### Assembly generated after making the first two arguments of `fshl` the same

```

test:                                   // @test

        ldr     w8, [x0]

        rev     w8, w8

        ror     w0, w8, #7

        ret

```

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WF1zozoS_TXyS5ddIAGGBz_4I55N1Wzt1kx2p-7TlIC2rRssZSSROPPrb0mAg4md5GZyJ-UgI7fO6dNqqQXcGLGViDMSL0i8GvHa7pSe_ahFMUZxy6XZjXJVPs6uJdgdQqFKhD1KK5TEEkTTS-LFJ1XmqrJQCXlL4hWh6c7aO0PYnNA1oett8_tE6S2h65-Erqd_THcpZkVIaAbCQsEl5AgG0YFyC0XF5Ra2KFFziwY2qtZAkqAqdU6SAKyCSvHSdX0j8SJwtEkwAbiWUKj9HdfCKEnoEj4tlxDSSdgDUxJbLAe1UdoLMXyPYNE4bwxOgAQrEsyb680OQSM3Sh7Na6nurNiLn1g2kRGmgRH7u0psRMFdnEBtfC_X29qFzrgOkgQbs6u8ww758-f__xuuv3S2Rx8cpI-yMOfDDK_F-b_sKr3RP4qpi3ND1vAIaVE7HuGHN1KToP00tzRWOoyBsBVYLiooeFWBYBRIFFTV_X7iREwEo4SmvpvGhZL3mYt6e690GHS3NCY0O8t0LY1FXnrDDTwgFDsut-iDsRHa2Evhc2ngUFqixHO5UX5qeAXcGNzn1aOLZDf_5aQ_s4Sy5gM3XdgHPhZHQyGLqi4RCFsaWwppJzvCrgZoJW6ERPh0dfN9Nb-ZE5q6NMwJXQo3CWRJyTz45SZetoTTxUdBfmhz9O-02wcj8ynl0pfQtBbSMvrd-m6a5i5H4oXLKP9t5YJN2BJoBGfCd4HmtJku30QGhC4g7FGGSUP54SS0R5L2ZH08E_NMNCPsfJr08mjVz2Mhrd-GHFEtfZko3arUUHLLSbxIog65S8NznncOwrduCGGLk_VyYt5bM34E-G2dLhtSuoSg1XJ-yX391xdC0wOhS9lGxN0CoQkEh3X716TfFWFXIJ_cP2J8-c_NEGQI28T_0E0goSmjMG7Qms9FgRptrWVHcixcdAnTE2HHyRjsyIMdq6sbXUmyF3aw5tZyvUXrY1nxR1XbdhlSHO8Jm-NYpITN3T-jYxEmhM39xd0lEWHzJBqLkDqDkKZjyWjT99X1UXpCYrW4q7Aj4FwXuyQa1_JWqgc5roSsD-OtrI_Dmms7B22JafPvzmpXSZoUcOFvg4TS6sdj7QJnE3hCfy4QqYtqf-yTmStS3vInHiyItBlqlSPumZld5a1cK-sH6JU4h02jni3Xmj-K8hD6EVu0WKGrV84BIXNVy9Kc8ckVvCSCsIcUXtBwZBgIoUMl4Vkl7ESKNCdyqOMJk94Ipb290p2Zg3BWzdczyqN3Ke97SV9RHg2Ux0Pl9Kzy5CXlseNJT4RPB8qVftKdnBGevks46yGxV4SnA-HZUDh7Ltydh54JmbZKmlPaifE_cswDcHXnf7JQexcZyLFSD-74isClOzU33TxX99h0W-WeAfxmVorNBjXKAt0ZmJt9D_M9Pvd9fMlljbZnE58tjyUWFdf4AmMLfWzOH3yHG_q8O64ez6qQPwLX-_t0zNtnojdu9s3meVK7CZ37KjP_befA5rGkt5mf8-fVE4975jvtevDLhMSLg6-f70HInhD8omDh25BMNehpfGk9oqwrCy-BKK1fAGlc80SUdbvyr6qjb1L3NxxLP8Yv9jF-vYqABzuAeAieNB1nb_o6kkb73rXMNxY17PmtkFvIld1dfK7lxr-OuLy0f-MiHjana_otWXB27t6-ejXeX5r918eqZ-zBr8_4qJyxMmMZH-EsTLIozmKaxaPdLI5YltI8ZryI0mkQFWla0CRLEgzCKCs2IzGjAWVBFrIgjaM4nmAapjwtaVkmEVKWkSjAPRfVxBcVpbcjYUyNsyQJ03hU8Rwr41_YUSrxAfyP7hgdr0Z65saM83prXFESxponFCtshbOvz15OXcrACnlpXD3uv-fqXqeMal3NBu-ehN3V-aRQe0LXjrVtxnda
_YmFJXTtfTWErr2WvwIAAP__3htENQ">