[llvm] 131b7fe - [RISCV][VLOPT] Add support for widening integer mul-add instructions (#112219)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 6 12:03:46 PST 2024
Author: Michael Maitland
Date: 2024-12-06T15:03:43-05:00
New Revision: 131b7fe2b108fb33efd031371f0f8a993e374860
URL: https://github.com/llvm/llvm-project/commit/131b7fe2b108fb33efd031371f0f8a993e374860
DIFF: https://github.com/llvm/llvm-project/commit/131b7fe2b108fb33efd031371f0f8a993e374860.diff
LOG: [RISCV][VLOPT] Add support for widening integer mul-add instructions (#112219)
This adds support for these instructions and also tests getOperandInfo
for these instructions as well. I think the VL on the using add
instruction can be optimized further, once we add support for optimizing
non-vlmax.
Added:
Modified:
llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index d4c1dd7dc549bf..e75eddb423e4b9 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -403,7 +403,19 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
case RISCV::VWMULSU_VV:
case RISCV::VWMULSU_VX:
case RISCV::VWMULU_VV:
- case RISCV::VWMULU_VX: {
+ case RISCV::VWMULU_VX:
+ // Vector Widening Integer Multiply-Add Instructions
+ // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
+ // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
+ // is then added to the 2*SEW-bit Dest. These instructions never have a
+ // passthru operand.
+ case RISCV::VWMACCU_VV:
+ case RISCV::VWMACCU_VX:
+ case RISCV::VWMACC_VV:
+ case RISCV::VWMACC_VX:
+ case RISCV::VWMACCSU_VV:
+ case RISCV::VWMACCSU_VX:
+ case RISCV::VWMACCUS_VX: {
unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW;
RISCVII::VLMUL EMUL =
IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
@@ -418,18 +430,7 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
case RISCV::VWADD_WV:
case RISCV::VWADD_WX:
case RISCV::VWSUB_WV:
- case RISCV::VWSUB_WX:
- // Vector Widening Integer Multiply-Add Instructions
- // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
- // Even though the add is a 2*SEW addition, the operands of the add are the
- // Dest which is 2*SEW and the result of the multiply which is 2*SEW.
- case RISCV::VWMACCU_VV:
- case RISCV::VWMACCU_VX:
- case RISCV::VWMACC_VV:
- case RISCV::VWMACC_VX:
- case RISCV::VWMACCSU_VV:
- case RISCV::VWMACCSU_VX:
- case RISCV::VWMACCUS_VX: {
+ case RISCV::VWSUB_WX: {
bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
bool TwoTimes = IsMODef || IsOp1;
unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
@@ -571,9 +572,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
// Vector Single-Width Integer Multiply-Add Instructions
// FIXME: Add support
// Vector Widening Integer Multiply-Add Instructions
- // FIXME: Add support
- case RISCV::VWMACC_VX:
+ case RISCV::VWMACCU_VV:
case RISCV::VWMACCU_VX:
+ case RISCV::VWMACC_VV:
+ case RISCV::VWMACC_VX:
+ case RISCV::VWMACCSU_VV:
+ case RISCV::VWMACCSU_VX:
+ case RISCV::VWMACCUS_VX:
// Vector Integer Merge Instructions
// FIXME: Add support
// Vector Integer Move Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
index 11f603b56b6e56..39cc90b812f99e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll
@@ -1248,44 +1248,149 @@ define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
ret <vscale x 4 x i64> %2
}
-define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
+; NOVLOPT-LABEL: vwmacc_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmacc_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
; NOVLOPT-LABEL: vwmacc_vx:
; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmacc.vx v10, a0, v8
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vx v8, a0, v10
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
; NOVLOPT-NEXT: ret
;
; VLOPT-LABEL: vwmacc_vx:
; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmacc.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vx v8, a0, v10
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
; VLOPT-NEXT: ret
- %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
-define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmaccu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, i32 %e, iXLen %vl) {
; NOVLOPT-LABEL: vwmaccu_vx:
; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmaccu.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
; NOVLOPT-NEXT: ret
;
; VLOPT-LABEL: vwmaccu_vx:
; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmaccu.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a2, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vv v8, v10, v11
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccus_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccus_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccus_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
; VLOPT-NEXT: ret
- %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccus.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
ret <vscale x 4 x i32> %2
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
index ff4f3e24ec17b0..1efe96c975754a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll
@@ -136,3 +136,17 @@ define <vscale x 4 x i32> @
diff erent_imm_vl_with_tu(<vscale x 4 x i32> %passthru
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4)
ret <vscale x 4 x i32> %w
}
+
+define <vscale x 4 x i32> @dont_optimize_tied_def(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; CHECK-LABEL: dont_optimize_tied_def:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vv v8, v10, v11
+; CHECK-NEXT: ret
+ %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+ %2 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %1, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl, iXLen 0)
+ ret <vscale x 4 x i32> %2
+}
+
More information about the llvm-commits
mailing list