[llvm] [RISCV] Fix matching bug in VLA shuffle lowering (PR #134750)

Mon Apr 7 15:49:50 PDT 2025

https://github.com/preames created https://github.com/llvm/llvm-project/pull/134750

Fix https://github.com/llvm/llvm-project/issues/134126.

The matching code was previous written as if we were mutating the indices to replace undef elements with preferred values, but the actual lowering code just took a prefix of the index vector.  This resulted in us using undef indices for lanes which should have been defined, resulting in incorrect codegen.

Longer term, we probably should rewrite the mask, but this seemed like an easier tactical fix.

>From c02313a28c4fcdb596696bb552b79ab5c95a1d1e Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Mon, 7 Apr 2025 14:47:51 -0700
Subject: [PATCH] [RISCV] Fix matching bug in VLA shuffle lowering

Fix https://github.com/llvm/llvm-project/issues/134126.

The matching code was previous written as if we were mutating the
indices to replace undef elements with preferred values, but the
actual lowering code just took a prefix of the index vector.  This
resulted in us using undef indices for lanes which should have
been defined, resulting in incorrect codegen.

Longer term, we probably should rewrite the mask, but this seemed
like an easier tactical fix.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 12 +++----
 .../RISCV/rvv/fixed-vectors-shuffle-int.ll    | 31 ++++++++++++-------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2a1dd2b2def17..f92f451fcf570 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5399,7 +5399,9 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
 /// Is this mask local (i.e. elements only move within their local span), and
 /// repeating (that is, the same rearrangement is being done within each span)?
 static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
-  SmallVector<int> LowSpan(Span, -1);
+  // Require a prefix from the original mask until the consumer code
+  // is adjusted to rewrite the mask instead of just taking a prefix.
+  SmallVector<int> LowSpan(Mask.take_front(Span));
   for (auto [I, M] : enumerate(Mask)) {
     if (M == -1)
       continue;
@@ -5407,8 +5409,6 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
       return false;
     int SpanIdx = I % Span;
     int Expected = M % Span;
-    if (LowSpan[SpanIdx] == -1)
-      LowSpan[SpanIdx] = Expected;
     if (LowSpan[SpanIdx] != Expected)
       return false;
   }
@@ -5424,13 +5424,13 @@ static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
 /// span, and then repeats that same result across all remaining spans.  Note
 /// that this doesn't check if all the inputs come from a single span!
 static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
-  SmallVector<int> LowSpan(Span, -1);
+  // Require a prefix from the original mask until the consumer code
+  // is adjusted to rewrite the mask instead of just taking a prefix.
+  SmallVector<int> LowSpan(Mask.take_front(Span));
   for (auto [I, M] : enumerate(Mask)) {
     if (M == -1)
       continue;
     int SpanIdx = I % Span;
-    if (LowSpan[SpanIdx] == -1)
-      LowSpan[SpanIdx] = M;
     if (LowSpan[SpanIdx] != M)
       return false;
   }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
index 65f78dcfb4bce..2020f3dc6bad2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -1372,36 +1372,45 @@ define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
   ret <8 x i64> %res
 }
 
-; FIXME: Doing this as a span spat requires rewriting the undef elements in
-; the mask not just using a prefix of the mask.
+; Doing this as a span splat requires rewriting the undef elements in the mask
+; not just using a prefix of the mask.
 define <8 x i64> @shuffle_v8i64_span_splat_neg(<8 x i64> %a) nounwind {
 ; CHECK-LABEL: shuffle_v8i64_span_splat_neg:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 1
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v10, a0
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT:    vmv.v.v v13, v12
-; CHECK-NEXT:    vmv.v.v v14, v12
-; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vrgatherei16.vv v14, v8, v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v9, v10, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v9
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
   ret <8 x i64> %res
 }
 
-; FIXME: A locally repeating shuffle needs to use a mask prefix
+; Doing this as a locally repeating shuffle requires rewriting the undef
+; elements in the mask not just using a prefix of the mask.
 define <8 x i32> @shuffle_v8i32_locally_repeating_neg(<8 x i32> %a) {
 ; CHECK-LABEL: shuffle_v8i32_locally_repeating_neg:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI87_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI87_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v12
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 poison, i32 poison, i32 5, i32 4, i32 6, i32 6>
   ret <8 x i32> %res