<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/61315">61315</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Incorrect code generated from LLVM IR on big-endian powerpc64 for shuffle of vector created from zero-extended extracted element
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
johnplatts
</td>
</tr>
</table>
<pre>
Here is some LLVM IR code that generates incorrect code on big-endian powerpc64 targets:
```
target triple = "powerpc64-unknown-linux-gnu"
define dso_local <16 x i8> @ConvertExtractedMaskBitsToVect(<16 x i8> noundef %0) local_unnamed_addr #0 {
%a4 = extractelement <16 x i8> %0, i64 7
%a5 = zext i8 %a4 to i16
%a6 = insertelement <8 x i16> poison, i16 %a5, i64 0
%a7 = bitcast <8 x i16> %a6 to <16 x i8>
%a8 = shufflevector <16 x i8> %a7, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%a9 = and <16 x i8> %a8, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>
%a10 = icmp eq <16 x i8> %a9, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>
%a11 = sext <16 x i1> %a10 to <16 x i8>
ret <16 x i8> %a11
}
attributes #0 = { mustprogress nofree nosync nounwind willreturn uwtable
"frame-pointer"="none" "min-legal-vector-width"="128"
"target-cpu"="pwr10"
"target-features"="+altivec,+power10-vector,+power8-vector,+power9-vector,+vsx" }
```
Here is the POWER10 assembly code that is generated when the above LLVM IR code is compiled with llc 17.0.0 (LLVM package version ```1:17~++20230308042327+f2c1b1a7f54e-1~exp1~20230308042449.790``` on Ubuntu 22.04):
```
.text
.file "vsx_convert_extracted_mask_bits_to_vect_030923.ll"
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0 # -- Begin function ConvertExtractedMaskBitsToVect
.LCPI0_0:
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.text
.globl ConvertExtractedMaskBitsToVect
.p2align 4
.type ConvertExtractedMaskBitsToVect,@function
.section .opd,"aw",@progbits
ConvertExtractedMaskBitsToVect: # @ConvertExtractedMaskBitsToVect
.p2align 3, 0x0
.quad .Lfunc_begin0
.quad .TOC.@tocbase
.quad 0
.text
.Lfunc_begin0:
.cfi_startproc
# %bb.0:
addis 3, 2, .LCPI0_0@toc@ha
xxlxor 36, 36, 36
xxlxor 35, 35, 35
addi 3, 3, .LCPI0_0@toc@l
lxv 0, 0(3)
xxperm 36, 34, 0
xxsplti32dx 0, 0, 16909320
xxsplti32dx 0, 1, 270549120
xxland 34, 36, 0
vcmpequb 2, 2, 3
xxlnor 34, 34, 34
blr
.long 0
.quad 0
.Lfunc_end0:
.size ConvertExtractedMaskBitsToVect, .Lfunc_end0-.Lfunc_begin0
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
```
The indices for the ```xxperm``` operation in the assembly code generated above (```.LCPI0_0```) are incorrect.
In the above LLVM IR code, the contents of lane 7 of ```%0``` (the value extracted into ```%a4```) should be in lane 1 of ```%a7``` on big-endian targets, and should therefore end up in the lower 8 lanes (lanes 0-7) of the shuffled vector instead of the upper 8 lanes (lanes 8-15).
The bug with the shuffle operation above is causing issues with some code that is compiled with clang for big-endian powerpc64 targets when optimizations are enabled, including issue https://github.com/llvm/llvm-project/issues/61275.
For comparison, here is some LLVM IR that generates correct code on big-endian POWER10:
```
target triple = "powerpc64-unknown-linux-gnu"
define dso_local <16 x i8> @ConvertExtractedMaskBitsToVect(<16 x i8> noundef %0) local_unnamed_addr #0 {
%a8 = shufflevector <16 x i8> %0, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%a9 = and <16 x i8> %a8, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>
%a10 = icmp eq <16 x i8> %a9, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>
%a11 = sext <16 x i1> %a10 to <16 x i8>
ret <16 x i8> %a11
}
attributes #0 = { mustprogress nofree nosync nounwind willreturn uwtable
"frame-pointer"="none" "min-legal-vector-width"="128"
"target-cpu"="pwr10"
"target-features"="+altivec,+power10-vector,+power8-vector,+power9-vector,+vsx" }
```
Here is the assembly code that is generated when the above LLVM IR code is compiled with llc-17:
```
.text
.file "vsx_convert_extracted_mask_bits_to_vect_030923_2.ll"
.section .rodata.cst16,"aM",@progbits,16
.p2align 4, 0x0 # -- Begin function ConvertExtractedMaskBitsToVect
.LCPI0_0:
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 7 # 0x7
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.byte 16 # 0x10
.text
.globl ConvertExtractedMaskBitsToVect
.p2align 4
.type ConvertExtractedMaskBitsToVect,@function
.section .opd,"aw",@progbits
ConvertExtractedMaskBitsToVect: # @ConvertExtractedMaskBitsToVect
.p2align 3, 0x0
.quad .Lfunc_begin0
.quad .TOC.@tocbase
.quad 0
.text
.Lfunc_begin0:
.cfi_startproc
# %bb.0:
addis 3, 2, .LCPI0_0@toc@ha
xxlxor 36, 36, 36
xxlxor 35, 35, 35
addi 3, 3, .LCPI0_0@toc@l
lxv 0, 0(3)
xxperm 36, 34, 0
xxsplti32dx 0, 0, 16909320
xxsplti32dx 0, 1, 270549120
xxland 34, 36, 0
vcmpequb 2, 2, 3
xxlnor 34, 34, 34
blr
.long 0
.quad 0
.Lfunc_end0:
.size ConvertExtractedMaskBitsToVect, .Lfunc_end0-.Lfunc_begin0
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWltv2zgW_jX0C2GBImXLevBD7TS7BTI7g6LTfTQo6cjmlCZVkoqdPPS3L0jJspQ4SXeDAbpBgkBSxHPj5dy-iFsrtgpgiWYrNLua8MbttFn-pXeqltw5O8l1ebf8JxjAwmKr94Bvbr7-hj99xoUuAbsdd3gLCgx3YLFQhTYGCteOaoVzsZ2CKgVXuNYHMHUxT7DjZgvOIvYBkStEPqA56X7Dn-0wdkbUEjBiVxhR2nNPG_VN6YOaSqGa43SrGkRpJydcS6iEAlxavZG64BIjto7n-IjFArGPGCVkrdUtGPfx6AwvHJS_cfttJZz9or9C4RBdjDmUblQJFUZ0RhDNcJC6aZTieyg3vCwNRpQRjNJVawH2pDwJpkOnRMIelHtoS5C4xmKe4HTIOwu893B0WCw6aU5jEc-HVPNAJZQFM1Sw8PLjuVdQa2G1CirieSv5pI8MJaVBUi5cwe1DGa0mp8e2D7kXgdvumqqScAuF0-bxRHnqNY9fh3UdvWY0kLO1YBTHwdbXP5DXPoynm4XpclVemOSim41YdOoXmHb3pLsvuns87x7YiWJ-IpnGtKd6pZSx6TFpj0yxrzF8v2B_9ovbH7dHzTtGb3x8Mj4mTx5TAxd8j8dxFzjSq2EE4c4ZkTc-orWO7UNQusL7xrra6K0Ba7HSlQHASts7VYQYcRCqxAchpQHXGIWbg-O5hE4ypZXhe5jWWigHxgctdoUoVVoBotTHuL1QUwlbLqetE00PonS7njIsKu3FtWFyWtRNT1EfTEwu0FTAXWPA9oSIrrh04hYKRNeIrkJ0jUmnd_Bu8fhVNnp1a4_B-n4Fx7G8vZ4SiNsB_uP3f3_8HBPMrYV9Lu8GiUTYPpeU-LADFRh4rm-hpRIWF3pfC-nHhdthKQscpxGJCEZ0IeXtHvcGxIh9iNMfiK4QXVFCGWFkQRLKaIroqqJFnMc8rWYJTOMfcKzjHwOiJMmiNCO9MJ_L_swb5RpMaUQSRLOn0hciWeTg6M5_VcKfggxRemuPm6LNPptTZig3e26_bXLh7MbpjV_cDWEkoyyScpDbsshC4YRW_tHokjseFdYFB0KU8t_Cvq5RQvwZ9eIQXZ8ShuepKZdi69mDh5EjwU_-IMrwdIpXsBUKV40KivELiTNoim7Wf3wiG3JeHpJF-Z3zKxByEMPkGJNLY8_-_C2Mb92a9Hm-jjG9wPf0yP8q8Q1aMnbzrdS5RCT7GS956I9nkXc1vCwjuPnJLS8GCF2XXVw4PI4LgeEFFexDmO_L5fKF-bAuvpzHvje89Pcbb_Qm92HlwuiX39cRSojTRc4tPBq_GGDHIodBp6jExjpufM4uTimR-cSf59GIlJelsDhYHaqQPoYFW1BCdrynPR7lURvMQt3SXx-OhjK7vw71tGrYRTWyp5TH27YAJYgumM82Zw01mP1JcxvJB4O2lk4wWh579jWO5xnJGH2GLFRpNCWzJItHdNKXua2WVuN58LbY1_C9ydslo-2sBpzKr0NytpKdT3kuzXmbpFbb8eaOt7vbYFDleHutuP8pT8EDAdMnDqA_KqDK80E5JcCPqsTP-RmiNFLaQfSPf_05tY4X3zpv89eLbnexQPqyAyxUKQqwuNIm1D09Ybvjg1Kk9iWSz8eiq5BGpdS5hGorJ9_Pnnj7E9cbQTPMfXF26tyjoVmfhhXYsPH3y-pHCq0cKGexrrDkCnDqnwbSZzw5G253upElzr26ljx-RJ6OSq4BfHACDeg6tF6v1O7Zpa9m8SKwBhn-ZdfFlrjrY4WyDnh5Gm7q-kWe6OHO5s22LVcHxINdbJfX17a8sUJtsbC2AduyBMxlVCKPK-BCcrUNZ-Y5qKWtpnXtxF7cB602bDso36GUoeFShWzKXj3eOVcHiIZeI3q9FW7X5FGh94he-zq7u01ro_8Kfnbd
Wo3o9Tym6Wy0CNfaBLO5OcERu0uQ0gM06RksqWsi3jiC9FO4CnkMq9yD0UIJJ7gU977VfApgSU8Qxysfuib-9U_vQMuvYv870PL_DrT8lwDLzc3Xzz2ufwlomcbp3w95bOibAz1-6Q72zVjya-Eub9mad9DjHfR4Bz3eQY9XgB6TcsnKjGV8Ast4ni4SkhDGJrslTysGUMAiz5Ocl0B4RskiqxawyBhks4lYdv-kySijhLEIsrJgGaXJnFZpwRKUENhzISPfnUbabCehL13OYxbPJpLnIG343IFSBYe21_VGz64mZhk62rzZWn8shXX2LMUJJ2H5afyBw7mYqoze933sU989-C69b_-rE8JQGDiL8K3bFI4OVAll__FAibt_708aI5ev6cxZPPtPAAAA__8TuC21">