[llvm] [Matrix] Propagate shape information through (f)abs insts (PR #141704)

Wed May 28 15:46:02 PDT 2025

https://github.com/jroelofs updated https://github.com/llvm/llvm-project/pull/141704

>From 0b5ffd24c1a06481dbd51804194a0440c1d8bc72 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 28 May 2025 15:44:39 -0700
Subject: [PATCH] [Matrix] Propagate shape information through (f)abs insts

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 57 ++++++++++++++++++-
 .../Transforms/LowerMatrixIntrinsics/binop.ll | 43 ++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 756a72e6d97bc..a58180c46be4b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -232,6 +232,15 @@ static bool isUniformShape(Value *V) {
   if (I->isBinaryOp())
     return true;
 
+  if (auto *II = dyn_cast<IntrinsicInst>(V))
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::abs:
+    case Intrinsic::fabs:
+      return true;
+    default:
+      return false;
+    }
+
   switch (I->getOpcode()) {
   case Instruction::FNeg:
     return true;
@@ -621,7 +630,7 @@ class LowerMatrixIntrinsics {
       case Intrinsic::matrix_column_major_store:
         return true;
       default:
-        return false;
+        return isUniformShape(II);
       }
     return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
   }
@@ -1127,6 +1136,9 @@ class LowerMatrixIntrinsics {
     case Intrinsic::matrix_column_major_store:
       LowerColumnMajorStore(Inst);
       break;
+    case Intrinsic::abs:
+    case Intrinsic::fabs:
+      return VisitUniformIntrinsic(cast<IntrinsicInst>(Inst));
     default:
       return false;
     }
@@ -2198,6 +2210,49 @@ class LowerMatrixIntrinsics {
     return true;
   }
 
+  /// Lower uniform shape intrinsics, if shape information is available.
+  bool VisitUniformIntrinsic(IntrinsicInst *Inst) {
+    auto I = ShapeMap.find(Inst);
+    if (I == ShapeMap.end())
+      return false;
+
+    IRBuilder<> Builder(Inst);
+    ShapeInfo &Shape = I->second;
+
+    MatrixTy Result;
+
+    switch (Inst->getIntrinsicID()) {
+    case Intrinsic::abs:
+    case Intrinsic::fabs: {
+      Value *Op = Inst->getOperand(0);
+
+      MatrixTy M = getMatrix(Op, Shape, Builder);
+
+      Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+      for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+        switch (Inst->getIntrinsicID()) {
+        case Intrinsic::abs:
+          Result.addVector(Builder.CreateBinaryIntrinsic(
+              Intrinsic::abs, M.getVector(I), Inst->getOperand(1)));
+          break;
+        case Intrinsic::fabs:
+          Result.addVector(Builder.CreateUnaryIntrinsic(Inst->getIntrinsicID(),
+                                                        M.getVector(I)));
+          break;
+        }
+
+      finalizeLowering(Inst,
+                       Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                               Result.getNumVectors()),
+                       Builder);
+      return true;
+    }
+    default:
+      llvm_unreachable("unexpected intrinsic");
+    }
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linarized by starting at an expression leaf and
   /// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
index 9160ced2715aa..48974a951c3bb 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll
@@ -432,3 +432,46 @@ define void @xor_2x2(ptr %lhs, ptr %rhs, ptr %out) {
   store <4 x i32> %optt, ptr %out
   ret void
 }
+
+define void @fabs_2x2f64(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2f64(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[COL_LOAD1]])
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 16
+; CHECK-NEXT:    ret void
+;
+  %load = load <4 x double>, ptr %in
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %load)
+  %fabst  = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabs, i32 2, i32 2)
+  %fabstt = call <4 x double> @llvm.matrix.transpose(<4 x double> %fabst, i32 2, i32 2)
+  store <4 x double> %fabstt, ptr %out
+  ret void
+}
+
+define void @fabs_2x2i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @fabs_2x2i32(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[COL_LOAD1]], i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP1]], i1 true)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[TMP2]], i1 true)
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %load = load <4 x i32>, ptr %in
+  %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %load, i1 false)
+  %abst  = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abs, i32 2, i32 2)
+  %abstt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %abst, i32 2, i32 2)
+  %absabstt = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abstt, i1 true)
+  store <4 x i32> %absabstt, ptr %out
+  ret void
+}