[llvm] [Matrix] Lower vector reductions using shape info (PR #142055)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 11:54:54 PDT 2025
================
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define i32 @reduce_add_4x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_4x2(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT: [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: ret i32 [[REDUCE]]
+;
+ %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+ %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+ ret i32 %reduce
+}
+
+define i32 @reduce_add_8x1(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_8x1(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load volatile <8 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT: [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[COL_LOAD]])
+; CHECK-NEXT: ret i32 [[REDUCE]]
+;
+ %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 8, i1 1, i32 8, i32 1)
+ %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+ ret i32 %reduce
+}
+
+define i32 @reduce_add_1x8(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_1x8(
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load volatile <1 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 1
+; CHECK-NEXT: [[COL_LOAD1:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr i32, ptr [[IN]], i64 3
+; CHECK-NEXT: [[COL_LOAD5:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT: [[COL_LOAD7:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[IN]], i64 5
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP8]], align 4
+; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i32, ptr [[IN]], i64 6
+; CHECK-NEXT: [[COL_LOAD11:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP10]], align 4
+; CHECK-NEXT: [[VEC_GEP12:%.*]] = getelementptr i32, ptr [[IN]], i64 7
+; CHECK-NEXT: [[COL_LOAD13:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP12]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add <1 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <1 x i32> [[TMP1]], [[COL_LOAD3]]
+; CHECK-NEXT: [[TMP3:%.*]] = add <1 x i32> [[TMP2]], [[COL_LOAD5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <1 x i32> [[TMP3]], [[COL_LOAD7]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <1 x i32> [[TMP4]], [[COL_LOAD9]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <1 x i32> [[TMP5]], [[COL_LOAD11]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <1 x i32> [[TMP6]], [[COL_LOAD13]]
+; CHECK-NEXT: [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[TMP7]])
----------------
fhahn wrote:
(It's not completely new; we already get similarly bad behavior for other ops, but with the reductions it's probably even worse, so it might be worth tackling before going further down that road.)
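
For concreteness, here is a minimal sketch (hypothetical output, not what the pass currently emits) of the flatter lowering the 1x8 case could get. It assumes the pass could recognize that the reduction consumes the whole flattened matrix, and it assumes a non-volatile load (the volatile column loads in the test above could not legally be merged); since the stride is 1, the eight 1-element columns are contiguous, so a single wide load covers the same memory:

define i32 @reduce_add_1x8_flat(ptr %in) {
  ; With stride 1, columns 0..7 are adjacent, so one <8 x i32> load
  ; stands in for the eight <1 x i32> column loads.
  %wide = load <8 x i32>, ptr %in, align 4
  ; One wide reduction instead of a chain of <1 x i32> adds feeding a
  ; degenerate reduce over a single lane.
  %reduce = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
  ret i32 %reduce
}

This matches the shape of the 8x1 output above, which is the form the pass already produces when the columns happen to line up with the flat vector.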
https://github.com/llvm/llvm-project/pull/142055