<html>
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - 256 bit double shuffles not optimal"
   href="https://bugs.llvm.org/show_bug.cgi?id=33434">33434</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>256 bit double shuffles not optimal
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: X86
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>tobias@grosser.es
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Hi,

I just tried to generate AVX2 code for some 256 bit AVX2 double shuffles, but
despite Chandler's outstanding work on improving X86 shuffles two years ago,
the shuffle sequences seem not to be optimal (using llc out.ll -o -
-mcpu=x86-64 -mattr=+avx2 on r304555).

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"                     

define void @test_0(<4 x double>* %PA, <4 x double>* %PB) {                     
entry:                                                                          
  %A = load <4 x double>, <4 x double>* %PA                                     
  %B = load <4 x double>, <4 x double>* %PB                                     
  %SA = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 0, i32
4, i32 2, i32 3&gt;
  %SB = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 1, i32
5, i32 6, i32 7&gt;

;       vmovddup        %xmm1, %xmm2    # xmm2 = xmm1[0,0]
;       vblendpd        $2, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm2[1],ymm0[2,3]
;       vpermilpd       $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
;       vblendpd        $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3]

  store <4 x double> %SA, <4 x double>* %PA                                     
  store <4 x double> %SB, <4 x double>* %PB                                     
  ret void                                                                      
}                                                                               

define void @test_1(<4 x double>* %PA, <4 x double>* %PB) {                    
 entry:                                                                         
  %A = load <4 x double>, <4 x double>* %PA                                     
  %B = load <4 x double>, <4 x double>* %PB                                     
  %SA = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 4, i32
5, i32 0, i32 6&gt;
  %SB = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 2, i32
3, i32 1, i32 7&gt;

;       vinsertf128     $1, %xmm0, %ymm0, %ymm2
;       vpermilpd       $2, %ymm1, %ymm3 # ymm3 = ymm1[0,1,2,2]
;       vblendpd        $4, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
;       vpermpd $222, %ymm0, %ymm0      # ymm0 = ymm0[2,3,1,3]
;       vblendpd        $8, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2],ymm1[3]

  store <4 x double> %SA, <4 x double>* %PA                                     
  store <4 x double> %SB, <4 x double>* %PB                                     
  ret void                                                                      
}                                                                               

define void @test_2(<4 x double>* %PA, <4 x double>* %PB) {                     
entry:                                                                          
  %A = load <4 x double>, <4 x double>* %PA                                     
  %B = load <4 x double>, <4 x double>* %PB                                     
  %SA = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 0, i32
1, i32 4, i32 5&gt;
  %SB = shufflevector &lt;4 x double&gt; %A, &lt;4 x double&gt; %B, &lt;4 x i32&gt; &lt;i32 2, i32
3, i32 6, i32 7&gt;

;       vinsertf128     $1, %xmm1, %ymm0, %ymm2
;       vperm2f128      $49, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3]

  store <4 x double> %SA, <4 x double>* %PA                                     
  store <4 x double> %SB, <4 x double>* %PB                                     
  ret void                                                                      
}

Am I missing something or could these really be translated to at most two
vblendpd instructions?

Best,
Tobias</pre>
        </div>
      </p>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>