[llvm] [InstCombine] Implement vp.reverse elimination through binop/unop (PR #143963)

Fri Jun 13 07:35:00 PDT 2025

https://github.com/preames updated https://github.com/llvm/llvm-project/pull/143963

>From 268a6f19c4873640aaea8ea4867ae898a09a3274 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Thu, 12 Jun 2025 08:15:43 -0700
Subject: [PATCH 1/3] [InstCombine] Implement vp.reverse elimination through
 binop/unop

This simply copies the structure of the vector.reverse patterns from
just above, and reimplements them for the vp.reverse intrinsics
when the masks and EVLs exactly match.

Its unfortunate that we have three different ways to represent a
reverse (shuffle, vector.reverse, and vp.reverse) but I don't see
an obvious way to remove any them because the semantics are slightly
different.

This sigificantly improves vectorization in TSVC_2 s112 and s1112
loops when using EVL tail folding.
---
 .../InstCombine/InstCombineCalls.cpp          | 36 +++++++++++++++++++
 .../test/Transforms/InstCombine/vp-reverse.ll | 32 ++++++-----------
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c169ab25b2106..ee7c9dbac2fa3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3582,6 +3582,42 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *BO0, *BO1, *X, *Y;
+    Value *Vec = II->getArgOperand(0);
+    Value *Mask = II->getArgOperand(1);
+    Value *EVL = II->getArgOperand(2);
+    auto m_VPReverse = [&](Value *&Vec) {
+      return m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+          m_Value(Vec), m_Specific(Mask), m_Specific(EVL));
+    };
+    if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
+      auto *OldBinOp = cast<BinaryOperator>(Vec);
+      if (match(BO0, m_VPReverse(X))) {
+        // rev(binop rev(X), rev(Y)) --> binop X, Y
+        if (match(BO1, m_VPReverse(Y)))
+          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
+                                             OldBinOp->getOpcode(), X, Y,
+                                             OldBinOp, OldBinOp->getName(),
+                                             II->getIterator()));
+        // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat
+        if (isSplatValue(BO1))
+          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
+                                             OldBinOp->getOpcode(), X, BO1,
+                                             OldBinOp, OldBinOp->getName(),
+                                             II->getIterator()));
+      }
+    }
+    // rev(unop rev(X)) --> unop X
+    if (match(Vec, m_OneUse(m_UnOp(m_VPReverse(X))))) {
+      auto *OldUnOp = cast<UnaryOperator>(Vec);
+      auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(
+          OldUnOp->getOpcode(), X, OldUnOp, OldUnOp->getName(),
+          II->getIterator());
+      return replaceInstUsesWith(CI, NewUnOp);
+    }
+    break;
+  }
   case Intrinsic::vector_reduce_or:
   case Intrinsic::vector_reduce_and: {
     // Canonicalize logical or/and reductions:
diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
index 79e6c47bdf1b2..41b27be5f5248 100644
--- a/llvm/test/Transforms/InstCombine/vp-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -3,11 +3,8 @@
 
 define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -18,11 +15,8 @@ define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4
 
 define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim2(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m, i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -63,10 +57,8 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vs
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)
@@ -76,10 +68,8 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim2(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
@@ -89,10 +79,8 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32
 
 define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
 ; CHECK-LABEL: @unop_reverse_splat_elim(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
-; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %op = fneg <vscale x 4 x float> %a.rev

>From 225ff0bbe4eb6b211738d9c883c988c15e817c35 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 13 Jun 2025 07:20:26 -0700
Subject: [PATCH 2/3] Address review comment

---
 .../lib/Transforms/InstCombine/InstCombineCalls.cpp |  4 +++-
 llvm/test/Transforms/InstCombine/vp-reverse.ll      | 13 +++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index ee7c9dbac2fa3..e6aaa01a27283 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3586,10 +3586,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Value *BO0, *BO1, *X, *Y;
     Value *Vec = II->getArgOperand(0);
     Value *Mask = II->getArgOperand(1);
+    if (!match(Mask, m_AllOnes()))
+      break;
     Value *EVL = II->getArgOperand(2);
     auto m_VPReverse = [&](Value *&Vec) {
       return m_Intrinsic<Intrinsic::experimental_vp_reverse>(
-          m_Value(Vec), m_Specific(Mask), m_Specific(EVL));
+          m_Value(Vec), m_AllOnes(), m_Specific(EVL));
     };
     if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
       auto *OldBinOp = cast<BinaryOperator>(Vec);
diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
index 41b27be5f5248..28645c5525e17 100644
--- a/llvm/test/Transforms/InstCombine/vp-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -13,10 +13,15 @@ define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4
   ret <vscale x 4 x i32> %add.rev
 }
 
-define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
-; CHECK-LABEL: @binop_reverse_elim2(
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+; Negative test - the mask needs to be reversed between the inner and
+; the outer to be correct.
+define <vscale x 4 x i32> @binop_reverse_elim_samemask(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_samemask(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m, i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)

>From 8ba4d6c45fea0f939120079daf317e61741f50c2 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Fri, 13 Jun 2025 07:30:35 -0700
Subject: [PATCH 3/3] Add handling for rev(binop BO0Splat, rev(Y)) --> binop
 BO0Splat, Y

---
 .../InstCombine/InstCombineCalls.cpp          |  6 ++++
 .../test/Transforms/InstCombine/vp-reverse.ll | 30 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e6aaa01a27283..9648306226447 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3609,6 +3609,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
                                              OldBinOp, OldBinOp->getName(),
                                              II->getIterator()));
       }
+      // rev(binop BO0Splat, rev(Y)) --> binop BO0Splat, Y
+      if (match(BO1, m_VPReverse(Y)) && isSplatValue(BO0))
+        return replaceInstUsesWith(CI,
+                                   BinaryOperator::CreateWithCopiedFlags(
+                                       OldBinOp->getOpcode(), BO0, Y, OldBinOp,
+                                       OldBinOp->getName(), II->getIterator()));
     }
     // rev(unop rev(X)) --> unop X
     if (match(Vec, m_OneUse(m_UnOp(m_VPReverse(X))))) {
diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
index 28645c5525e17..ad7ab33afce3c 100644
--- a/llvm/test/Transforms/InstCombine/vp-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -82,6 +82,36 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32
   ret <vscale x 4 x i32> %add.rev
 }
 
+define <vscale x 4 x i32> @binop_reverse_splat_elim3(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim3(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[B_VEC]], [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %b.vec, %a.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim4(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim4(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B_VEC]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev,  %b.vec
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
 define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
 ; CHECK-LABEL: @unop_reverse_splat_elim(
 ; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV:%.*]]