[llvm] [RISCV][DAGCombiner] Fix potential missed combine in VL->VW extension (PR #168026)

Sat Nov 15 01:22:26 PST 2025

https://github.com/OMG-link updated https://github.com/llvm/llvm-project/pull/168026

>From f60ec38b138e3eb2608a57df8f64d14ff1458663 Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Fri, 14 Nov 2025 18:12:31 +0800
Subject: [PATCH 1/2] [RVV] Add test for missed VWMACC combine

Add a minimal reproducer containing two consecutive mul+add chains that
share the same sign-extended vector operand, to illustrate that the current
DAG combine logic may fail to fold each chain into a single vwmacc.vx
instruction.
---
 .../CodeGen/RISCV/rvv/combine-vl-vw-macc.ll   | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
new file mode 100644
index 0000000000000..2bbaf73e6e0dc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define void @matmul_min(<32 x i8>* %vptr, i8* %scalars, <32 x i16>* %acc0_ptr, <32 x i16>* %acc1_ptr) {
+; CHECK-LABEL: matmul_min:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a4, 64
+; CHECK-NEXT:    li a5, 32
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    lb a1, 1(a1)
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a3)
+; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
+; CHECK-NEXT:    vwmacc.vx v8, a0, v20
+; CHECK-NEXT:    vwmul.vx v16, v20, a1
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vse8.v v12, (a3)
+; CHECK-NEXT:    ret
+entry:
+  %acc0 = load <32 x i16>, <32 x i16>* %acc0_ptr, align 1
+  %acc1 = load <32 x i16>, <32 x i16>* %acc1_ptr, align 1
+
+  %v8 = load <32 x i8>, <32 x i8>* %vptr, align 1
+  %v16 = sext <32 x i8> %v8 to <32 x i16>
+
+  %s0_ptr = getelementptr i8, i8* %scalars, i32 0
+  %s0_i8 = load i8, i8* %s0_ptr, align 1
+  %s0_i16 = sext i8 %s0_i8 to i16
+  %tmp0 = insertelement <32 x i16> undef, i16 %s0_i16, i32 0
+  %splat0 = shufflevector <32 x i16> %tmp0, <32 x i16> undef, <32 x i32> zeroinitializer
+  %mul0 = mul <32 x i16> %splat0, %v16
+  %add0 = add <32 x i16> %mul0, %acc0
+
+  %s1_ptr = getelementptr i8, i8* %scalars, i32 1
+  %s1_i8 = load i8, i8* %s1_ptr, align 1
+  %s1_i16 = sext i8 %s1_i8 to i16
+  %tmp1 = insertelement <32 x i16> undef, i16 %s1_i16, i32 0
+  %splat1 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> zeroinitializer
+  %mul1 = mul <32 x i16> %splat1, %v16
+  %add1 = add <32 x i16> %mul1, %acc1
+
+  store <32 x i16> %add0, <32 x i16>* %acc0_ptr, align 1
+  store <32 x i16> %add1, <32 x i16>* %acc1_ptr, align 1
+
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

>From 5147468e127ff6e2f8879993287f1ff91c075bee Mon Sep 17 00:00:00 2001
From: Kai Lin <omg_link at qq.com>
Date: Sat, 15 Nov 2025 16:30:52 +0800
Subject: [PATCH 2/2] [RISCV][DAGCombiner] Fix missed combines in
 combineOp_VLToVWOp_VL

The previous implementation of combineOp_VLToVWOp_VL manually replaced old
nodes with newly created widened nodes, but only added the new node itself to
the DAGCombiner worklist. Because the users of the new node were not added,
they were not queued to be revisited, so combine opportunities that only
appear after the replacement could be missed.

This patch replaces the custom replacement logic with a call to
DCI.CombineTo(), which performs node replacement in a way consistent with
DAGCombiner::Run:
- Replace all uses of the old node.
- Add the new node and its users to the worklist.
- Clean up unused nodes when appropriate.

Using CombineTo ensures that combineOp_VLToVWOp_VL behaves consistently with
the standard DAGCombiner update model, avoiding discrepancies between the
private worklist inside this routine and the global worklist managed by the
combiner.

This resolves missed combine cases involving VL -> VW operator widening.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp       |  3 +--
 llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll | 11 ++++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 38cce26e44af4..8ba1215561dc3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18300,8 +18300,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N,
     }
   }
   for (std::pair<SDValue, SDValue> OldNewValues : ValuesToReplace) {
-    DAG.ReplaceAllUsesOfValueWith(OldNewValues.first, OldNewValues.second);
-    DCI.AddToWorklist(OldNewValues.second.getNode());
+    DCI.CombineTo(OldNewValues.first.getNode(), OldNewValues.second);
   }
   return InputRootReplacement;
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
index 2bbaf73e6e0dc..6c179e4f1472c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vl-vw-macc.ll
@@ -7,19 +7,16 @@ define void @matmul_min(<32 x i8>* %vptr, i8* %scalars, <32 x i16>* %acc0_ptr, <
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    li a5, 32
-; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v20, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lb a0, 0(a1)
 ; CHECK-NEXT:    lb a1, 1(a1)
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    vle8.v v12, (a3)
 ; CHECK-NEXT:    vsetvli zero, a5, e8, m2, ta, ma
-; CHECK-NEXT:    vwmacc.vx v8, a0, v20
-; CHECK-NEXT:    vwmul.vx v16, v20, a1
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vadd.vv v12, v16, v12
+; CHECK-NEXT:    vwmacc.vx v8, a0, v16
+; CHECK-NEXT:    vwmacc.vx v12, a1, v16
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m4, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a2)
 ; CHECK-NEXT:    vse8.v v12, (a3)