[llvm] [PowerPC] Fix vector_shuffle combines when inputs are scalar_to_vector of differing types. (PR #80784)

Mon Nov 4 12:03:51 PST 2024

================
@@ -15686,6 +15690,50 @@ static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
                      OrigSToV.getOperand(0));
 }
 
+static bool isShuffleMaskInRange(const SmallVectorImpl<int> &ShuffV,
+                                 int HalfVec, int LHSLastElementDefined,
+                                 int RHSLastElementDefined) {
+  for (int I : seq<int>(0, ShuffV.size())) {
+    int Index = ShuffV[I];
+    if (Index < 0) // Skip explicitly undefined mask indices.
+      continue;
+    // Handle first input vector of the vector_shuffle.
+    if ((LHSLastElementDefined >= 0) && (Index < HalfVec) &&
+        (Index > LHSLastElementDefined))
+      return false;
+    // Handle second input vector of the vector_shuffle.
+    if ((RHSLastElementDefined >= 0) &&
+        (Index > HalfVec + RHSLastElementDefined))
+      return false;
+  }
+  return true;
+}
+
+static SDValue generateSToVPermutedForVecShuffle(
+    int ScalarSize, uint64_t ShuffleEltWidth, unsigned &NumValidElts,
+    int FirstElt, int &LastElt, SDValue VecShuffOperand, SDValue SToVNode,
+    SelectionDAG &DAG, const PPCSubtarget &Subtarget) {
+  EVT VecShuffOperandType = VecShuffOperand.getValueType();
+  // Set up the values for the shuffle vector fixup.
+  NumValidElts = ScalarSize / VecShuffOperandType.getScalarSizeInBits();
+  // The last element depends on if the input comes from the LHS or RHS.
+  //
+  // For example:
+  // (shuff (s_to_v i32), (bitcast (s_to_v i64), v4i32), ...)
+  //
+  // For the LHS: The last element that comes from the LHS is actually 0, not 3
+  // because elements 1 and higher of a scalar_to_vector are undefined.
+  // For the RHS: The last element that comes from the RHS is actually 5, not 7
+  // because elements 1 and higher of a scalar_to_vector are undefined.
+  // It is also not 4 because the original scalar_to_vector is wider and
+  // actually contains two i32 elements.
+  LastElt = ScalarSize / (ShuffleEltWidth + 1) + FirstElt;
----------------
amy-kwan wrote:

I think we can discuss this offline, but I am a bit unclear about the suggestion. It would not work for the test case highlighted in the comment above, where we expect the last element of the LHS to be 0 and the last element of the RHS to be 5:
```
define void @test_v4i32_v2i64(ptr %a) {
entry:
  %0 = load <2 x i16>, ptr undef, align 8
  %tmp0_1 = bitcast <2 x i16> %0 to i32
  %tmp0_2 = insertelement <4 x i32> undef, i32 %tmp0_1, i32 0
  %1 = load <2 x i16>, ptr %a, align 4
  %tmp1_1 = bitcast <2 x i16> %1 to i32
  %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
  %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  store <4 x i32> %2, ptr undef, align 4
  ret void
}
```
Also, adding `+ 1` to `ShuffleEltWidth` makes the integer division round down (for example, with 32-bit shuffle elements, 32 / 33 = 0 and 64 / 33 = 1), so we get the correct value when we add `FirstElt` to it.

https://github.com/llvm/llvm-project/pull/80784