[llvm] 933182e - [RISCV] Improve support for forming widening multiplies when one input is a scalar splat.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 27 09:45:17 PDT 2021
Author: Craig Topper
Date: 2021-09-27T09:37:07-07:00
New Revision: 933182e948bbaf309dc9fd58bac875e7664503dd
URL: https://github.com/llvm/llvm-project/commit/933182e948bbaf309dc9fd58bac875e7664503dd
DIFF: https://github.com/llvm/llvm-project/commit/933182e948bbaf309dc9fd58bac875e7664503dd.diff
LOG: [RISCV] Improve support for forming widening multiplies when one input is a scalar splat.
If one input of a fixed vector multiply is a sign/zero extend and
the other operand is a splat of a scalar, we can use a widening
multiply if the scalar value has sufficient sign/zero bits.
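For illustration, the first new test added below (vwmul_vx_v8i16_i8) exercises the transform end to end: the scalar i8 has enough sign bits to be splatted at the narrow element width, so the multiply of a sign-extended <8 x i8> vector by the splat now selects a single vwmul.vx (compare vwmul_vx_v8i16_i16 below, where the combine cannot fire and a vsext.vf2 plus vmul.vx sequence is emitted instead):

    define <8 x i16> @vwmul_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
      %a = load <8 x i8>, <8 x i8>* %x
      %b = load i8, i8* %y
      %c = sext i8 %b to i16
      %d = insertelement <8 x i16> undef, i16 %c, i32 0
      %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
      %f = sext <8 x i8> %a to <8 x i16>
      %g = mul <8 x i16> %e, %f
      ret <8 x i16> %g
    }

now compiles (with -riscv-v-vector-bits-min=128) to:

    vsetivli zero, 8, e8, mf2, ta, mu
    vle8.v v25, (a0)
    lb a0, 0(a1)
    vwmul.vx v8, v25, a0
    ret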
Reviewed By: frasercrmck
Differential Revision: https://reviews.llvm.org/D110028
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ff8d2d948c5c..9b45ff188f25 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6576,6 +6576,87 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
return SDValue(N, 0);
}
+// Try to form VWMUL or VWMULU.
+// FIXME: Support VWMULSU.
+static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
+ SelectionDAG &DAG) {
+ assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
+ bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
+ bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+ if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
+ return SDValue();
+
+ SDValue Mask = N->getOperand(2);
+ SDValue VL = N->getOperand(3);
+
+ // Make sure the mask and VL match.
+ if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL)
+ return SDValue();
+
+ MVT VT = N->getSimpleValueType(0);
+
+ // Determine the narrow size for a widening multiply.
+ unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
+ MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize),
+ VT.getVectorElementCount());
+
+ SDLoc DL(N);
+
+ // See if the other operand uses the same extend opcode.
+ if (Op0.getOpcode() == Op1.getOpcode()) {
+ if (!Op1.hasOneUse())
+ return SDValue();
+
+ // Make sure the mask and VL match.
+ if (Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
+ return SDValue();
+
+ Op1 = Op1.getOperand(0);
+ } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) {
+ // The operand is a splat of a scalar.
+
+ // The VL must be the same.
+ if (Op1.getOperand(1) != VL)
+ return SDValue();
+
+ // Get the scalar value.
+ Op1 = Op1.getOperand(0);
+
+ // See if we have enough sign bits or zero bits in the scalar to use a
+ // widening multiply by splatting to a smaller element size.
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned ScalarBits = Op1.getValueSizeInBits();
+ // Make sure we're getting all element bits from the scalar register.
+ // FIXME: Support implicit sign extension of vmv.v.x?
+ if (ScalarBits < EltBits)
+ return SDValue();
+
+ if (IsSignExt) {
+ if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
+ return SDValue();
+ } else {
+ APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+ if (!DAG.MaskedValueIsZero(Op1, Mask))
+ return SDValue();
+ }
+
+ Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL);
+ } else
+ return SDValue();
+
+ Op0 = Op0.getOperand(0);
+
+ // Re-introduce narrower extends if needed.
+ unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
+ if (Op0.getValueType() != NarrowVT)
+ Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+ if (Op1.getValueType() != NarrowVT)
+ Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+ unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -7027,45 +7108,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::MUL_VL: {
- // Try to form VWMUL or VWMULU.
- // FIXME: Look for splat of extended scalar as well.
- // FIXME: Support VWMULSU.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
- bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
- if ((!IsSignExt && !IsZeroExt) || Op0.getOpcode() != Op1.getOpcode())
- return SDValue();
-
- // Make sure the extends have a single use.
- if (!Op0.hasOneUse() || !Op1.hasOneUse())
- return SDValue();
-
- SDValue Mask = N->getOperand(2);
- SDValue VL = N->getOperand(3);
- if (Op0.getOperand(1) != Mask || Op1.getOperand(1) != Mask ||
- Op0.getOperand(2) != VL || Op1.getOperand(2) != VL)
- return SDValue();
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
-
- MVT VT = N->getSimpleValueType(0);
- MVT NarrowVT =
- MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() / 2),
- VT.getVectorElementCount());
-
- SDLoc DL(N);
-
- // Re-introduce narrower extends if needed.
- unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
- if (Op0.getValueType() != NarrowVT)
- Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
- if (Op1.getValueType() != NarrowVT)
- Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
-
- unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
- return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+ if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG))
+ return V;
+ if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG))
+ return V;
+ return SDValue();
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index 58395e2845d3..025e3197f0f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
define <2 x i16> @vwmul_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
; CHECK-LABEL: vwmul_v2i16:
@@ -649,3 +649,239 @@ define <16 x i64> @vwmul_vx_v16i64(<16 x i32>* %x, i32 %y) {
ret <16 x i64> %f
}
+define <8 x i16> @vwmul_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v25, (a0)
+; CHECK-NEXT: lb a0, 0(a1)
+; CHECK-NEXT: vwmul.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <8 x i8>, <8 x i8>* %x
+ %b = load i8, i8* %y
+ %c = sext i8 %b to i16
+ %d = insertelement <8 x i16> undef, i16 %c, i32 0
+ %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+ %f = sext <8 x i8> %a to <8 x i16>
+ %g = mul <8 x i16> %e, %f
+ ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmul_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v25, (a0)
+; CHECK-NEXT: lh a0, 0(a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vsext.vf2 v26, v25
+; CHECK-NEXT: vmul.vx v8, v26, a0
+; CHECK-NEXT: ret
+ %a = load <8 x i8>, <8 x i8>* %x
+ %b = load i16, i16* %y
+ %d = insertelement <8 x i16> undef, i16 %b, i32 0
+ %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+ %f = sext <8 x i8> %a to <8 x i16>
+ %g = mul <8 x i16> %e, %f
+ ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lb a0, 0(a1)
+; CHECK-NEXT: vwmul.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i8, i8* %y
+ %c = sext i8 %b to i32
+ %d = insertelement <4 x i32> undef, i32 %c, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = sext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lh a0, 0(a1)
+; CHECK-NEXT: vwmul.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i16, i16* %y
+ %c = sext i16 %b to i32
+ %d = insertelement <4 x i32> undef, i32 %c, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = sext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lw a0, 0(a1)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vsext.vf2 v26, v25
+; CHECK-NEXT: vmul.vx v8, v26, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i32, i32* %y
+ %d = insertelement <4 x i32> undef, i32 %b, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = sext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lb a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lb a0, 0(a1)
+; RV64-NEXT: vwmul.vx v8, v25, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i8, i8* %y
+ %c = sext i8 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = sext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lh a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lh a0, 0(a1)
+; RV64-NEXT: vwmul.vx v8, v25, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i16, i16* %y
+ %c = sext i16 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = sext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lw a0, 0(a1)
+; RV64-NEXT: vwmul.vx v8, v25, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i32, i32* %y
+ %c = sext i32 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = sext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i64(<2 x i32>* %x, i64* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: sw a2, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vsext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: ld a0, 0(a1)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vsext.vf2 v26, v25
+; RV64-NEXT: vmul.vx v8, v26, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i64, i64* %y
+ %d = insertelement <2 x i64> undef, i64 %b, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = sext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 5c95f9ff1745..45083b530457 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -649,3 +649,246 @@ define <16 x i64> @vwmulu_vx_v16i64(<16 x i32>* %x, i32 %y) {
ret <16 x i64> %f
}
+define <8 x i16> @vwmulu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulu_vx_v8i16_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v25, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: vwmulu.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <8 x i8>, <8 x i8>* %x
+ %b = load i8, i8* %y
+ %c = zext i8 %b to i16
+ %d = insertelement <8 x i16> undef, i16 %c, i32 0
+ %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+ %f = zext <8 x i8> %a to <8 x i16>
+ %g = mul <8 x i16> %e, %f
+ ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmulu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwmulu_vx_v8i16_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v25, (a0)
+; CHECK-NEXT: lh a0, 0(a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vzext.vf2 v26, v25
+; CHECK-NEXT: vmul.vx v8, v26, a0
+; CHECK-NEXT: ret
+ %a = load <8 x i8>, <8 x i8>* %x
+ %b = load i16, i16* %y
+ %d = insertelement <8 x i16> undef, i16 %b, i32 0
+ %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+ %f = zext <8 x i8> %a to <8 x i16>
+ %g = mul <8 x i16> %e, %f
+ ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lbu a0, 0(a1)
+; CHECK-NEXT: vwmulu.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i8, i8* %y
+ %c = zext i8 %b to i32
+ %d = insertelement <4 x i32> undef, i32 %c, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = zext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lhu a0, 0(a1)
+; CHECK-NEXT: vwmulu.vx v8, v25, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i16, i16* %y
+ %c = zext i16 %b to i32
+ %d = insertelement <4 x i32> undef, i32 %c, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = zext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwmulu_vx_v4i32_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT: vle16.v v25, (a0)
+; CHECK-NEXT: lw a0, 0(a1)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vzext.vf2 v26, v25
+; CHECK-NEXT: vmul.vx v8, v26, a0
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, <4 x i16>* %x
+ %b = load i32, i32* %y
+ %d = insertelement <4 x i32> undef, i32 %b, i32 0
+ %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+ %f = zext <4 x i16> %a to <4 x i32>
+ %g = mul <4 x i32> %e, %f
+ ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lb a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vzext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lb a0, 0(a1)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vzext.vf2 v26, v25
+; RV64-NEXT: vmul.vx v8, v26, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i8, i8* %y
+ %c = zext i8 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = zext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lh a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vzext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lh a0, 0(a1)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vzext.vf2 v26, v25
+; RV64-NEXT: vmul.vx v8, v26, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i16, i16* %y
+ %c = zext i16 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = zext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: srai a0, a1, 31
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vzext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: lw a0, 0(a1)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vzext.vf2 v26, v25
+; RV64-NEXT: vmul.vx v8, v26, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i32, i32* %y
+ %c = zext i32 %b to i64
+ %d = insertelement <2 x i64> undef, i64 %c, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = zext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulu_vx_v2i64_i64(<2 x i32>* %x, i64* %y) {
+; RV32-LABEL: vwmulu_vx_v2i64_i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT: lw a2, 4(a1)
+; RV32-NEXT: lw a1, 0(a1)
+; RV32-NEXT: vle32.v v25, (a0)
+; RV32-NEXT: sw a2, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v26, (a0), zero
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT: vzext.vf2 v27, v25
+; RV32-NEXT: vmul.vv v8, v26, v27
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vwmulu_vx_v2i64_i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT: vle32.v v25, (a0)
+; RV64-NEXT: ld a0, 0(a1)
+; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT: vzext.vf2 v26, v25
+; RV64-NEXT: vmul.vx v8, v26, a0
+; RV64-NEXT: ret
+ %a = load <2 x i32>, <2 x i32>* %x
+ %b = load i64, i64* %y
+ %d = insertelement <2 x i64> undef, i64 %b, i64 0
+ %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+ %f = zext <2 x i32> %a to <2 x i64>
+ %g = mul <2 x i64> %e, %f
+ ret <2 x i64> %g
+}
+