[llvm] [AMDGPU] Enable vectorization of i8 values. (PR #134934)

Fri Apr 18 11:37:17 PDT 2025

================
@@ -126,24 +126,24 @@ define amdgpu_kernel void @add_i16() #0 {
 define amdgpu_kernel void @add_i8() #0 {
 ; ALL-LABEL: 'add_i8'
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
----------------
doru1004 wrote:

Are you expecting the following code to be worth vectorizing from a cost model perspective?
```
  %el0 = extractelement <16 x i8> %invec, i64  0
   %el1 = extractelement <16 x i8> %invec, i64  1
   %el2 = extractelement <16 x i8> %invec, i64  2
   %el3 = extractelement <16 x i8> %invec, i64  3
   %mul0 = mul i8 %el0, 1
   %mul1 = mul i8 %el1, 1
   %mul2 = mul i8 %el2, 1
   %mul3 = mul i8 %el3, 1
   %add0 = add i8 %mul0, 1
   %add1 = add i8 %mul1, 1
   %add2 = add i8 %mul2, 1
   %add3 = add i8 %mul3, 1
   %vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
   %vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
   %vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
   %vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
   store <16 x i8> %vecins3, ptr %out
   ret void
```

I ask because, with this patch applied, it can currently be transformed into this:

```
  %0 = shufflevector <16 x i8> %invec, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = mul <4 x i8> %0, splat (i8 1)
  %2 = add <4 x i8> %1, splat (i8 1)
  %3 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  store <16 x i8> %3, ptr %out, align 16
  ret void
```

So is there really any benefit to vectorizing this code?

https://github.com/llvm/llvm-project/pull/134934