<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/55237">55237</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Missed vectorization of eigen after https://reviews.llvm.org/D119965
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          weiguozhi
      </td>
    </tr>
</table>

<pre>
    After the patch https://reviews.llvm.org/D119965, we observed performance regression in one of our eigen applications. It is because of missed vectorization in the new code.
```
Before this patch
     │ 80:   mov      -0x58(%rsp),%rdx                                                                                                                                                      
     │       mov      -0x60(%rsp),%r9                                                                                                                                                       
     │       mov      -0x40(%rsp),%r15                                                                                                                                                      
0.63 │ 8f:   mulps    %xmm8,%xmm9                                                                                                                                                           
0.44 │       movups   (%rdx,%r9,4),%xmm1                                                                                                                                                    
0.36 │       addps    %xmm9,%xmm1                                                                                                                                                           
0.25 │       movups   %xmm1,(%rdx,%r9,4)                                                                                                                                                    
0.75 │       mulps    %xmm8,%xmm3                                                                                                                                                           
0.41 │       movups   (%rdx,%r10,4),%xmm1                                                                                                                                                   
0.86 │       addps    %xmm3,%xmm1                                                                                                                                                           
0.70 │       movups   %xmm1,(%rdx,%r10,4)                                                                                                                                                   
0.43 │       add      $0x8,%r9                                                                                                                                                              
0.28 │       add      -0x48(%rsp),%rbx                                                                                                                                                      
0.07 │       cmp      %r15,%r9                                                                                                                                                              
0.14 │     ↓ jge      3e5                                             
             ...

After the patch
0.33 │ 80:   mov      %r12,%rdx                                                                                                                                                             
 0.32 │       or       $0x1,%rdx                                                                                                                                                             
 0.32 │       mov      %r12,%rdi                                                                                                                                                             
 0.39 │       or       $0x2,%rdi                                                                                                                                                             
 0.38 │       mov      %r12,%rcx                                                                                                                                                             
 0.30 │       or       $0x3,%rcx                                                                                                                                                             
 0.37 │       mov      %r12,%rbp                                                                                                                                                             
 0.39 │       or       $0x4,%rbp                                                                                                                                                             
 0.31 │       mov      %r12,%r10                                                                                                                                                             
 0.27 │       or       $0x5,%r10                                                                                                                                                             
 0.31 │       mov      %r12,%r9                                                                                                                                                              
 0.37 │       or       $0x6,%r9                                                                                                                                                              
 0.29 │       mov      %r12,%r8                                                                                                                                                              
 0.35 │       or       $0x7,%r8                                                                                                                                                              
 0.34 │ b1:   mulss    %xmm8,%xmm13                                                                                                                                                          
 0.39 │       addss    (%r11,%r12,4),%xmm13                                                                                                                                                  
 0.33 │       movss    %xmm13,(%r11,%r12,4)                                                                                                                                                  
 0.33 │       mulss    %xmm8,%xmm12                                                                                                                                                          
 0.41 │       addss    (%r11,%rdx,4),%xmm12                                                                                                                                                  
 0.38 │       movss    %xmm12,(%r11,%rdx,4)                                                                                                                                                  
 0.31 │       mulss    %xmm8,%xmm3                                                                                                                                                           
 0.39 │       addss    (%r11,%rdi,4),%xmm3                                                                                                                                                   
 0.35 │       movss    %xmm3,(%r11,%rdi,4)                                                                                                                                                   
 0.41 │       mulss    %xmm8,%xmm4                                                                                                                                                           
 0.31 │       addss    (%r11,%rcx,4),%xmm4                                                                                                                                                   
 0.31 │       movss    %xmm4,(%r11,%rcx,4)                                                                                                                                                   
 0.34 │       mulss    %xmm8,%xmm5                                                                                                                                                           
 0.41 │       addss    (%r11,%rbp,4),%xmm5                                                                                                                                                   
 0.34 │       movss    %xmm5,(%r11,%rbp,4)                                                                                                                                                   
 0.32 │       mulss    %xmm8,%xmm6                                                                                                                                                           
 0.34 │       addss    (%r11,%r10,4),%xmm6                                                                                                                                                   
 0.38 │       movss    %xmm6,(%r11,%r10,4)                                                                                                                                                   
 0.35 │       mulss    %xmm8,%xmm7                                                                                                                                                           
 0.43 │       addss    (%r11,%r9,4),%xmm7                                                                                                                                                    
 0.38 │       movss    %xmm7,(%r11,%r9,4)                                                                                                                                                    
 0.41 │       mulss    %xmm8,%xmm1                                                                                                                                                           
 0.36 │       addss    (%r11,%r8,4),%xmm1                                                                                                                                                    
 0.32 │       movss    %xmm1,(%r11,%r8,4)                                                                                                                                                    
 0.39 │       add      $0x8,%r12                                                                                                                                                             
 0.39 │       add      -0x18(%rsp),%rbx                                                                                                                                                      
 0.02 │       cmp      -0x60(%rsp),%r12                                                                                                                                                      
 0.31 │     ↓ jge      510                                                                    
              ...
```

Following is the simplified IR before the first LICM. The original code is much more complex, and multiple optimizations are involved, so only the related instructions and control flow are listed. Each "or" instruction shown actually represents a group of "or" instructions that compute consecutive memory addresses.
```
LoopHeader1:
  br %cond1, label %PreHeader2, %LoopExit1

PreHeader2:
  br label %LoopHeader2

LoopHeader2:
  br %cond2, label %LoopBody2, label %LoopExit2

LoopBody2:
  %100 = or i64 %24, 1
  ... // uses of %100
  br label %LoopHeader2

LoopExit2:
  %200 = or i64 %24, 1
  ... // uses of %200
  br label %LoopHeader1
```

Without this patch:
  * First LICM of loop2, the definition of %100 is moved to PreHeader2. So now the definition of %100 dominates %200.
  * LoopRotate of loop1, BB LoopHeader1 is duplicated into predecessors and deleted, PreHeader2 becomes the new loop header of loop1, so now it's more obvious that %100 dominates %200.
  * GVN, because the definition of %100 dominates %200, %200 is deleted, all uses of %200 are replaced by %100.
  * SLPVectorizer, it is easy to prove that the memory addresses computed by the group of "or" instructions are consecutive, so the load/mul/add/store sequence is vectorized.
  
With this patch, we have following different behavior:

  * First LICM of loop2, speculation is disabled by the patch https://reviews.llvm.org/D119965, the definition of %100 is not moved, so code is not changed.
  * LoopRotate of loop2, LoopHeader2 is duplicated into predecessors and deleted. Note that a new preheader for loop2 is created. Now we have the following code
```
LoopHeader1:
  br %cond1, label %PreHeader2, %LoopExit1

PreHeader2:
  br %cond2, label %NewPreHeader2, label %LoopExit2

NewPreHeader2:
  br %LoopBody2

LoopBody2:
  %100 = or i64 %24, 1
  ... // uses of %100
  br %cond2, label %LoopBody2, label %_crit_edge

_crit_edge:
  br label %LoopExit2

LoopExit2:
  %200 = or i64 %24, 1
  ... // uses of %200
  br label %LoopHeader1
```
  * Second LICM of loop2, this time the definition of %100 is moved to NewPreHeader2, but this time it doesn't dominate %200.
```
LoopHeader1:
  br %cond1, label %PreHeader2, %LoopExit1

PreHeader2:
  br %cond2, label %NewPreHeader2, label %LoopExit2

NewPreHeader2:
  %100 = or i64 %24, 1
  br %LoopBody2

LoopBody2:
  ... // uses of %100
  br %cond2, label %LoopBody2, label %_crit_edge

_crit_edge:
  br label %LoopExit2

LoopExit2:
  %200 = or i64 %24, 1
  ... // uses of %200
  br label %LoopHeader1
```
  * LoopRotate of loop1, LoopHeader1 is duplicated into predecessors and deleted, PreHeader2 becomes the new loop header of loop1. %100 still doesn't dominate %200
```
NewPreHeader1:
  br label %PreHeader2

PreHeader2:                        // It's actually loop header of loop1
  br %cond2, label %NewPreHeader2, label %LoopExit2

NewPreHeader2:
  %100 = or i64 %24, 1
  br %LoopBody2

LoopBody2:
  ... // uses of %100
  br %cond2, label %LoopBody2, label %_crit_edge

_crit_edge:
  br label %LoopExit2

LoopExit2:
  %200 = or i64 %24, 1
  ... // uses of %200
  br %cond1, label %PreHeader2, label %LoopExit1
```
  * GVN, because the definition of %100 can reach %200 but doesn't dominate %200, GVN adds a new definition on the other path PreHeader2 -> LoopExit2. It's a critical edge, so it is split, and a new definition of the "or" instruction is inserted in the new BB. A PHI instruction is inserted in LoopExit2.
```
NewPreHeader1:
  br label %PreHeader2

PreHeader2:                        // It's actually loop header of loop1
  br %cond2, label %NewPreHeader2, label %LoopExit2.crit_edge

NewPreHeader2:
  %100 = or i64 %24, 1
  br %LoopBody2

LoopBody2:
  ... // uses of %100
  br %cond2, label %LoopBody2, label %_crit_edge

_crit_edge:
  br label %LoopExit2

LoopExit2.crit_edge:
  %150 = or i64 %24, 1
  br label %LoopExit2
  
LoopExit2:
  %200 = phi i64 [%150, LoopExit2.crit_edge], [%100, _crit_edge]
  ... // uses of %200
  br %cond1, label %PreHeader2, label %LoopExit1
```
  * SLPVectorizer, when checking the addresses of the memory accesses, it gets a group of PHI instructions, which is not understood by SLPVectorizer, so the memory loads/stores can't be vectorized. Hence we got the regression.
 
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztXFuPozYU_jXkxVrEJeTykIeZ7k470na1aqvt44qLk7glmGKTmemv77ENBIJNklVVyKpHo0wAX87nc_E5xk5Ek7fNw5bjAvE9RnnI4z3ac54zy3-wvCf4K_CR4Bdmp-nxYNNiB7feu-56vQgs7wf0ghGNGC6OOEE5Lra0OIRZjFGBdwVmjNAMkQzRDIptES0LhMkOZyjM85TEIYfnzEbPHBGGIhyHJZMFD4QxaPCIY04L8rcsJ9oRPGb4BcU0wbblvLecB2vhVH_y8hEDCxgKQoMSjbqNBFkfPGs9t1YeWjkAD-4c6FE-Qe-c12BlefAXFCy3vDVgE9-TVzRJ0qJS1MYEo9LHtB6H5Yt0Jaa5DpMbjMPzJVKYHHvht7VvW2lfmeZMlvKC18NhpbDAt6lKSFCNaD7XSamUgCoBJa-1xsH_eSMsAOiOicBENTJ_0UMWJklbUhMHUlGNxwuGJCVRSDx6mY2JwEQ1sqUGmcmm_BH5vUSNTbnX25Tr3INR1chWF23KnzSOmhrNc77FphqZjQnBQI0O-jpJVWW8ufO6mngYUVHj_VZmPCKa0EV90aSjPsd2lj1M8SGvysho6L5k5J7HEuJqtbDWPvpjh1VZH98W4rWiyZpsu84Z5OdZ2tNM__5QoiCH15t4dlBRNQQAyevpCy3qQsKm3fsHZJAQGY3Za-gEaH1BQncHqO929RKK70Xl-jN-V0L-vQHqTyJ6CUX5aMxeQ9fb0PzeAGnTgapQW0KuMxqz11ADyOurXFdCwZ0BulpCdxGI6Z1CV0KLOwPk9Z2CXkKr0Xi9ik4S6q95dCW0vDdA7eA_cpulUaZdxnEnvI4zNBNBzlkDkummW4fcUgG76zgThHiC1l8eAHtqy8r1TysffZQjYjDQIDSjGnoDLY5MDSDNcqJRDeUaVVcNJwjxQnrRUUNPo4YNyhExGGgwpjCp4QQ9RUPf5A0Tcq6GU4Q4NBmfqaHOGTYgR4RgoiHnYVTD-YgMX6IhuzKqYdzzhlOEeCENactqrlHDeLrO0BAdKjKq4UT3AEj6pkk5ys_VcIoQB2XVVcNAo4YNyBEhmGhwwdmkhosRGb5EQ7Iypyi9V81ThHhDbLjQZSiTfzdriDdMargckeFLdPKG2rfNejXs7SKaJMIb1HCpUcPpb7u5MTac_m4OZNrypVfD1T3su7n0qrSTKWvUcHUPaqhPLKtC7a0qU1zKaNFVgN45r-797VUBUE5fAZvNKqYtylMVmDnj0uxYCf6dt1iajSytnSzd3e_q84mmKX0h2U7sqmfkkKdkS3CCnn9BUb09HqMtKRhHH59_-NlGv8E1LciOZGEqN9eLmocy3oOvgOIxhTawSBdRmCXC13MCNxDNOTlU2_MZCqEkyY40PeJEFGUU0Sx9QwVOQw7dk4zxooyrwtBOTDNe0BRtgVtZOyUMCip2LM-jBXycVYt5Gaay0bzADGcc7qFdQcvcUEPwXnIsvmQMxyUnR4wOGHC9CeMShxQw04_mR0rzn3CY4EK8lajlEBXCb0JriXCWKA0jnIo7nwusCouFR3FDVP_wSrjblk2rVKfJpplTp167Xvu2jhevw4so_UiTt_5dwVCvYVX01CyUdR0HWf578W6JLObijifWEJBblwEVROqACCphBMWpDVXtRlCKoU7f3rf17V3q2x2wmd8J39OStw-OtHl6QE-NuYjuUmhUDq6wpARvSUbkGZVmFKQBUXEqhlN0ErqNfqUoA3U31kvoAaxQKKyCZLeZEFh-oRwe10xIHXx8RC2UouukVOdrpNkBB2AsCY5B12mhbC_BKebKTk_ciSM49ABd1-dsRA9oLx92OmQKBOGWt2TKR9DoSGgpqob8Kiw_fvkkmqpP_Vw9IJV1eWqMWzjAL5ypg3Qq4CnSMIaBiN6qJjts_Prx85fqmBEuRDNE
HkbCIXsTosspjB9SmtH3G7V3kY2LAsoTSQa07ks608YPVUMpKqY0BBBP4FrhE9qHT8bFuDL8V4nFiSpRtz4QBU6ywdCob1t31cmsfQi-btvMBgnZbnEBPhMGHR4RYLDW8Ss0neXAdVodxYKBJyyM0hPy24-ODZlORrkyn2qI6jlJ3I_3YbZrj4DWMCTPLb9zi1XY6HMIUuYcxkqwJnUa2C0kD6E0DahbWwZMqapL8TQucCib-AQG0peBADL6ZKOdMz7hl26rw_NGt_h586dZ5T-aam6YBr_GBeFfcbLDbebO7_YnEe3UOfr0VXkxLNDrpidQSQjSBvxre6Lq6UBUT4myDXCNCcUsA6_PG7_c9e3fqVZfo6Y3av7_Wn1Rqw3hzn8X69i14BknEF30lZ91MJ6BaCuTawj3W9pmUnFjYqgG-1kFYU1mpI3Z_reSO7KSK3xjj7khM7o20o7DDKLlEMK4Conw_kaHX0Vm0LhcK63ColbD1Xl9KgInER7u29b3zvI_oNMgyrhb6TESwgB7TpEUiOqlesjA0rmKrrp9AQhuWDQQLgIucaEcRGPvj482ekCff3oeKtxwqJ_evisLt7W28R3bunkBxmzutq664DK4PBamLuok7oJPyfdEtR08qg7rufCcseC9DJ9UMWWoXztPR_RFvXT7ZY8zSOlw_KdIj06ZtfgVkCrbjmN5q0rOd5AChqdE-8yA61JVrlhmwCUk0lTmqb3Oq-S76qjKwVXeDc5Qer2ok3WL5RtI6gQPouLpV07qbHSWbPxk7a_DGSc8xZufdb9kAmxXP4MijyBemzTPyiLddAvvIO8vIxsiGbgQNap_7_KC_gF9wiUwUIrBewoCz1_O9ht_NY8Xy0Ucr5MwcuMFDlYhTvy1s9r6yyVez6RA2UYqkCdcpWwCvoPuzMjGczzPCeDD8wI_sOfLZbJIkngVLDzszkNr7uBDSNIGwKzYSJaicsfgoVjkPaGbhTB8uwxj2R20H5Z8T4vNCwxQSf_ek5nsfCOZ_wfpfcc8">