[llvm] [RISCV][DAGCombiner] Fix potential missed combine in VL->VW extension (PR #168026)

Mon Nov 17 01:59:13 PST 2025

https://github.com/OMG-link updated https://github.com/llvm/llvm-project/pull/168026

>From aefd5bdadab5cd4c8a83d1952592b612b195bc24 Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Fri, 14 Nov 2025 18:12:31 +0800
Subject: [PATCH 1/2] [RVV] Add test for missed VWMACC combine

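Add a minimal reproducer for consecutive vwmacc-like operations to show
that the existing DAG combine logic can fail to fold a mul+add chain into
a single vwmacc.vx instruction. In the test, two mul+add chains share the
same sign-extended vector operand; only the first becomes vwmacc.vx, while
the second is left as vwmul.vx followed by vadd.vv.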
---
 .../CodeGen/RISCV/rvv/combine-vl-vw-macc.ll   | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
new file mode 100644
index 0000000000000..663359bb5b4da
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) {
+; CHECK-LABEL: matmul_min:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a4, 64
+; CHECK-NEXT:    li a5, 32
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    lb a1, 1(a1)
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a3)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vwmacc.vx v8, a0, v20
+; CHECK-NEXT:    vwmul.vx v16, v20, a1
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vse8.v v12, (a3)
+; CHECK-NEXT:    ret
+entry:
+  %acc0 = load <32 x i16>, ptr %acc0_ptr, align 1
+  %acc1 = load <32 x i16>, ptr %acc1_ptr, align 1
+
+  %v8 = load <32 x i8>, ptr %vptr, align 1
+  %v16 = sext <32 x i8> %v8 to <32 x i16>
+
+  %s0_ptr = getelementptr i8, ptr %scalars, i32 0
+  %s0_i8 = load i8, ptr %s0_ptr, align 1
+  %s0_i16 = sext i8 %s0_i8 to i16
+  %tmp0 = insertelement <32 x i16> poison, i16 %s0_i16, i32 0
+  %splat0 = shufflevector <32 x i16> %tmp0, <32 x i16> poison, <32 x i32> zeroinitializer
+  %mul0 = mul <32 x i16> %splat0, %v16
+  %add0 = add <32 x i16> %mul0, %acc0
+
+  %s1_ptr = getelementptr i8, ptr %scalars, i32 1
+  %s1_i8 = load i8, ptr %s1_ptr, align 1
+  %s1_i16 = sext i8 %s1_i8 to i16
+  %tmp1 = insertelement <32 x i16> poison, i16 %s1_i16, i32 0
+  %splat1 = shufflevector <32 x i16> %tmp1, <32 x i16> poison, <32 x i32> zeroinitializer
+  %mul1 = mul <32 x i16> %splat1, %v16
+  %add1 = add <32 x i16> %mul1, %acc1
+
+  store <32 x i16> %add0, ptr %acc0_ptr, align 1
+  store <32 x i16> %add1, ptr %acc1_ptr, align 1
+
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

>From 16d98bcce9449c2f8c2af217b51c2485b8697305 Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Sat, 15 Nov 2025 16:30:52 +0800
Subject: [PATCH 2/2] [RISCV][DAGCombiner] Fix missed combines in
 combineOp_VLToVWOp_VL

The previous implementation of combineOp_VLToVWOp_VL manually replaced old
nodes with newly created widened nodes, but only added the new node itself to
the DAGCombiner worklist. Because the users of the new node were not added,
the combiner might never revisit them, so combines that only become possible
after the replacement could be missed.

This patch replaces the custom replacement logic with a call to
DCI.CombineTo(), which performs node replacement in a way consistent with
DAGCombiner::Run:
- Replace all uses of the old node.
- Add the new node and its users to the worklist.
- Clean up unused nodes when appropriate.

Using CombineTo ensures that combineOp_VLToVWOp_VL behaves consistently with
the standard DAGCombiner update model, avoiding discrepancies between the
private worklist inside this routine and the global worklist managed by the
combiner.

This resolves missed combine cases involving VL -> VW operator widening.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp       |  3 +--
 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll | 11 ++++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 38cce26e44af4..8ba1215561dc3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18300,8 +18300,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
     }
   }
   for (std::pair<SDValue, SDValue> OldNewValues : ValuesToReplace) {
-    DAG.ReplaceAllUsesOfValueWith(OldNewValues.first, OldNewValues.second);
-    DCI.AddToWorklist(OldNewValues.second.getNode());
+    DCI.CombineTo(OldNewValues.first.getNode(), OldNewValues.second);
   }
   return InputRootReplacement;
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
index 663359bb5b4da..943d8d2409ffd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -7,19 +7,16 @@ define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    li a5, 32
-; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lb a0, 0(a1)
 ; CHECK-NEXT:    lb a1, 1(a1)
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vle8.v v12, (a3)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vwmacc.vx v8, a0, v20
-; CHECK-NEXT:    vwmul.vx v16, v20, a1
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vwmacc.vx v8, a0, v16
+; CHECK-NEXT:    vwmacc.vx v12, a1, v16
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a2)
 ; CHECK-NEXT:    vse8.v v12, (a3)