[llvm] [RISCV][DAGCombiner] Fix potential missed combine in VL->VW extension (PR #168026)

Sat Nov 15 21:53:24 PST 2025

https://github.com/OMG-link updated https://github.com/llvm/llvm-project/pull/168026

>From ee4ed0bee96051a604fa4a5df91bf3571a2976de Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Fri, 14 Nov 2025 18:12:31 +0800
Subject: [PATCH 1/2] [RVV] Add test for missed VWMACC combine

Add a minimal reproducer with two mul+add chains that share the same
sign-extended vector operand, to illustrate that the current DAG combine
logic may fail to combine one of the chains into a single vwmacc.vx
instruction.
---
 .../CodeGen/RISCV/rvv/combine-vl-vw-macc.ll   | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
new file mode 100644
index 0000000000000..f4615df5d310e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) {
+; CHECK-LABEL: matmul_min:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a4, 64
+; CHECK-NEXT:    li a5, 32
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    lb a1, 1(a1)
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a3)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vwmacc.vx v8, a0, v20
+; CHECK-NEXT:    vwmul.vx v16, v20, a1
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vse8.v v12, (a3)
+; CHECK-NEXT:    ret
+entry:
+  %acc0 = load <32 x i16>, ptr %acc0_ptr, align 1
+  %acc1 = load <32 x i16>, ptr %acc1_ptr, align 1
+
+  %v8 = load <32 x i8>, ptr %vptr, align 1
+  %v16 = sext <32 x i8> %v8 to <32 x i16>
+
+  %s0_ptr = getelementptr i8, ptr %scalars, i32 0
+  %s0_i8 = load i8, ptr %s0_ptr, align 1
+  %s0_i16 = sext i8 %s0_i8 to i16
+  %tmp0 = insertelement <32 x i16> undef, i16 %s0_i16, i32 0
+  %splat0 = shufflevector <32 x i16> %tmp0, <32 x i16> undef, <32 x i32> zeroinitializer
+  %mul0 = mul <32 x i16> %splat0, %v16
+  %add0 = add <32 x i16> %mul0, %acc0
+
+  %s1_ptr = getelementptr i8, ptr %scalars, i32 1
+  %s1_i8 = load i8, ptr %s1_ptr, align 1
+  %s1_i16 = sext i8 %s1_i8 to i16
+  %tmp1 = insertelement <32 x i16> undef, i16 %s1_i16, i32 0
+  %splat1 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> zeroinitializer
+  %mul1 = mul <32 x i16> %splat1, %v16
+  %add1 = add <32 x i16> %mul1, %acc1
+
+  store <32 x i16> %add0, ptr %acc0_ptr, align 1
+  store <32 x i16> %add1, ptr %acc1_ptr, align 1
+
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

>From 590531af4b196673f5a0e4060ff6bfdcf56cc875 Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Sat, 15 Nov 2025 16:30:52 +0800
Subject: [PATCH 2/2] [RISCV][DAGCombiner] Fix missed combines in
 combineOp_VLToVWOp_VL

The previous implementation of combineOp_VLToVWOp_VL manually replaced old
nodes with newly created widened nodes, but only added the new node itself to
the DAGCombiner worklist. Because the users of the new node were not added,
they were not revisited by the combiner, and combines that depend on those
users being reconsidered could be missed.

This patch replaces the custom replacement logic with a call to
DCI.CombineTo(), which performs node replacement in a way consistent with
DAGCombiner::Run:
- Replace all uses of the old node.
- Add the new node and its users to the worklist.
- Clean up unused nodes when appropriate.

Using CombineTo ensures that combineOp_VLToVWOp_VL behaves consistently with
the standard DAGCombiner update model, avoiding discrepancies between the
private worklist inside this routine and the global worklist managed by the
combiner.

This resolves missed combine cases involving VL -> VW operator widening.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp       |  3 +--
 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll | 11 ++++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 38cce26e44af4..8ba1215561dc3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18300,8 +18300,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
     }
   }
   for (std::pair<SDValue, SDValue> OldNewValues : ValuesToReplace) {
-    DAG.ReplaceAllUsesOfValueWith(OldNewValues.first, OldNewValues.second);
-    DCI.AddToWorklist(OldNewValues.second.getNode());
+    DCI.CombineTo(OldNewValues.first.getNode(), OldNewValues.second);
   }
   return InputRootReplacement;
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
index f4615df5d310e..8dd8e07dbb6ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -7,19 +7,16 @@ define void @matmul_min(ptr %vptr, ptr %scalars, ptr %acc0_ptr, ptr %acc1_ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    li a5, 32
-; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lb a0, 0(a1)
 ; CHECK-NEXT:    lb a1, 1(a1)
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vle8.v v12, (a3)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vwmacc.vx v8, a0, v20
-; CHECK-NEXT:    vwmul.vx v16, v20, a1
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vwmacc.vx v8, a0, v16
+; CHECK-NEXT:    vwmacc.vx v12, a1, v16
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a2)
 ; CHECK-NEXT:    vse8.v v12, (a3)