[llvm] [AArch64][Codegen] Transform saturating smull to sqdmulh (PR #143671)
Nashe Mncube via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 03:46:42 PDT 2025
https://github.com/nasherm updated https://github.com/llvm/llvm-project/pull/143671
>From db2aaabfced58858c9fa98a4617ebbcdfe8cba40 Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Tue, 10 Jun 2025 16:20:42 +0100
Subject: [PATCH 1/2] [AArch64][Codegen] Transform saturating smull to sqdmulh
This patch adds a pattern for recognizing saturating vector
smull operations. Prior to this patch, they were lowered to a
combination of smull, smull2, uzp2 and smin, like the following:
```
smull2 v5.2d, v1.4s, v2.4s
smull v1.2d, v1.2s, v2.2s
uzp2 v1.4s, v1.4s, v5.4s
smin v1.4s, v1.4s, v0.4s
add v1.4s, v1.4s, v1.4s
```
which now optimizes to
```
sqdmulh v0.4s, v1.4s, v0.4s
sshr v0.4s, v0.4s, #1
add v0.4s, v0.4s, v0.4s
```
This transform only operates on vectors containing Q31 data types.
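For context, here is a minimal C sketch (not code from this patch; q31_mult
is a made-up name) of the kind of Q31 multiply loop that, once vectorized,
produces the pattern above:
```
#include <stdint.h>

/* Scalar model of the vectorized Q31 multiply that gets matched:
   sign-extend and multiply, take the high 32 bits, clamp to the
   Q31 maximum, then shift left by one. */
void q31_mult(const int32_t *a, const int32_t *b, int32_t *dst, int n) {
  for (int i = 0; i < n; ++i) {
    int64_t prod = (int64_t)a[i] * (int64_t)b[i]; /* sext + mul         */
    int32_t hi = (int32_t)(prod >> 32);           /* take the high word */
    if (hi > 0x3FFFFFFF)                          /* smin clamp to Q31  */
      hi = 0x3FFFFFFF;
    dst[i] = (int32_t)((uint32_t)hi << 1);        /* final shift by one */
  }
}
```
In the vectorized form the clamp becomes the smin shown above, and the final
shift by one is what the trailing add implements.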
Change-Id: Ib7d4d5284d1bd3fdd0907365f9e2f37f4da14671
---
.../Target/AArch64/AArch64ISelLowering.cpp | 73 +++++++++++++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 ++
.../CodeGen/AArch64/saturating-vec-smull.ll | 25 +++++++
3 files changed, 105 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f51caef6d228..4abe7af42aba8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26356,6 +26356,77 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
+// A special combine for the vqdmulh family of instructions. This is one of a
+// set of potential patterns that could match this instruction. The base pattern
+// vshl(smin(uzp(smull, smull2), clamp), 1) can be reduced to
+// vshl(vshr(sqdmulh(...), 1), 1) when operating on Q31 data types.
+static SDValue performVSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+
+ SDValue Op0 = N->getOperand(0);
+ ConstantSDNode *Splat = isConstOrConstSplat(N->getOperand(1));
+
+ if (Op0.getOpcode() != ISD::SMIN || !Splat || !Splat->isOne())
+ return SDValue();
+
+ auto trySQDMULHCombine = [](SDNode *N, SelectionDAG &DAG) -> SDValue {
+ EVT VT = N->getValueType(0);
+
+ if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
+ return SDValue();
+
+ ConstantSDNode *Clamp;
+
+ if (N->getOpcode() != ISD::SMIN)
+ return SDValue();
+
+ Clamp = isConstOrConstSplat(N->getOperand(1));
+
+ if (!Clamp) {
+ return SDValue();
+ }
+
+ MVT ScalarType;
+ int ShftAmt = 0;
+ // Here we are considering clamped Arm Q format
+ // data types which use 2 upper bits, one for the
+ // integer part and one for the sign.
+ switch (Clamp->getSExtValue()) {
+ case (1ULL << 30) - 1:
+ ScalarType = MVT::i32;
+ ShftAmt = 32;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue Mulhs = N->getOperand(0);
+ if (Mulhs.getOpcode() != ISD::MULHS)
+ return SDValue();
+
+ SDValue V0 = Mulhs.getOperand(0);
+ SDValue V1 = Mulhs.getOperand(1);
+
+ SDLoc DL(Mulhs);
+ const unsigned LegalLanes = 128 / ShftAmt;
+ EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
+ return DAG.getNode(AArch64ISD::SQDMULH, DL, LegalVecVT, V0, V1);
+ };
+
+ if (SDValue Val = trySQDMULHCombine(Op0.getNode(), DAG)) {
+ SDLoc DL(N);
+ EVT VecVT = N->getOperand(0).getValueType();
+ // Clear lower bits for correctness
+ SDValue RightShift =
+ DAG.getNode(AArch64ISD::VASHR, DL, VecVT, Val, N->getOperand(1));
+ return DAG.getNode(AArch64ISD::VSHL, DL, VecVT, RightShift,
+ N->getOperand(1));
+ }
+
+ return SDValue();
+}
+
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@@ -26496,6 +26567,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMaskedGatherScatterCombine(N, DCI, DAG);
case ISD::FP_EXTEND:
return performFPExtendCombine(N, DAG, DCI, Subtarget);
+ case AArch64ISD::VSHL:
+ return performVSHLCombine(N, DCI, DAG);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 727831896737d..75910fb47c663 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -992,6 +992,7 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
[SDNPCommutative]>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
[SDNPCommutative]>;
+def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>;
// Reciprocal estimates and steps.
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
@@ -1194,6 +1195,7 @@ def AArch64gld1q_index_merge_zero
: SDNode<"AArch64ISD::GLD1Q_INDEX_MERGE_ZERO", SDTypeProfile<1, 4, []>,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
// Match add node and also treat an 'or' node is as an 'add' if the or'ed operands
// have no common bits.
def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
@@ -8262,6 +8264,7 @@ def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+
defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
int_aarch64_neon_sqdmulh_laneq>;
defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
@@ -9365,6 +9368,10 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
(EXTRACT_SUBREG V128:$Rm, dsub)),
(UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+
+def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+ (SQDMULHv4i32 V128:$Rn, V128:$Rm)>;
+
// Conversions within AdvSIMD types in the same register size are free.
// But because we need a consistent lane ordering, in big endian many
// conversions require one or more REV instructions.
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
new file mode 100644
index 0000000000000..c1bb370ac3e89
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
+
+define <4 x i32> @arm_mult_q31(ptr %0, ptr %1){
+; CHECK-LABEL: arm_mult_q31:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: sshr v0.4s, v0.4s, #1
+; CHECK-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: ret
+ %7 = getelementptr i8, ptr %0, i64 0
+ %9 = getelementptr i8, ptr %1, i64 0
+ %12 = load <4 x i32>, ptr %7, align 4
+ %13 = sext <4 x i32> %12 to <4 x i64>
+ %14 = load <4 x i32>, ptr %9, align 4
+ %15 = sext <4 x i32> %14 to <4 x i64>
+ %16 = mul nsw <4 x i64> %15, %13
+ %17 = lshr <4 x i64> %16, splat (i64 32)
+ %18 = trunc nuw <4 x i64> %17 to <4 x i32>
+ %19 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %18, <4 x i32> splat (i32 1073741823))
+ %20 = shl <4 x i32> %19, splat (i32 1)
+ ret <4 x i32> %20
+}
>From 30ea16bd37c31616fc6b44bfae87b80c6ad49beb Mon Sep 17 00:00:00 2001
From: nasmnc01 <nashe.mncube at arm.com>
Date: Thu, 26 Jun 2025 11:27:14 +0100
Subject: [PATCH 2/2] Responding to review comments
Based on the most recent PR comments I've
- refactored the change to match a reduced pattern
which is truer to the actual SQDMULH instruction
(sketched below)
- added pattern matches for the Q31, Q15, int32 and int16
data types
- rewritten and extended the tests
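To illustrate the reduced pattern (a sketch only, not code from the patch;
mulh_sat_s16 is a hypothetical helper), the scalar shape matched for the
i16 case is trunc(smin(ashr(mul(sext a, sext b), 15), 32767)), which lines
up with what sqdmulh computes:
```
#include <stdint.h>

/* Scalar model of the reduced pattern for the i16 case:
   trunc(smin(ashr(mul(sext a, sext b), 15), 32767)). */
static inline int16_t mulh_sat_s16(int16_t a, int16_t b) {
  int32_t wide = (int32_t)a * (int32_t)b; /* sext + mul              */
  int32_t sh = wide >> 15;                /* ashr by (bit width - 1) */
  if (sh > 32767)                         /* smin clamp              */
    sh = 32767;
  return (int16_t)sh;                     /* trunc back to i16       */
}
```
The i32 case uses a shift of 31 and a clamp of 2147483647, while the Q15/Q31
cases use shifts of 16/32 with clamps of 16383/1073741823, matching the clamp
values switched on in the combine.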
Change-Id: I18c05e56b3979b8dd757d533e44a65496434937b
---
.../Target/AArch64/AArch64ISelLowering.cpp | 153 +++++++++---------
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 +
.../CodeGen/AArch64/saturating-vec-smull.ll | 69 +++++---
3 files changed, 134 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4abe7af42aba8..6bf65700c9c38 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20717,6 +20717,83 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
+// A special combine for the vqdmulh family of instructions.
+// truncate( smin( sra( mul( sext v0, sext v1 ), SHIFT_AMOUNT ),
+// SATURATING_VAL ) ) can be reduced to sqdmulh(...)
+static SDValue trySQDMULHCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+
+ if (N->getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
+ return SDValue();
+
+ SDValue SMin = N->getOperand(0);
+
+ if (SMin.getOpcode() != ISD::SMIN)
+ return SDValue();
+
+ ConstantSDNode *Clamp = isConstOrConstSplat(SMin.getOperand(1));
+
+ if (!Clamp)
+ return SDValue();
+
+ MVT ScalarType;
+ unsigned ShiftAmt = 0;
+ // Here we are considering clamped Arm Q format
+ // data types which use 2 upper bits, one for the
+ // integer part and one for the sign. We also consider
+ // standard signed integer types.
+ switch (Clamp->getSExtValue()) {
+ case (1ULL << 14) - 1: // Q15 saturation
+ case (1ULL << 15) - 1:
+ ScalarType = MVT::i16;
+ ShiftAmt = 16;
+ break;
+ case (1ULL << 30) - 1: // Q31 saturation
+ case (1ULL << 31) - 1:
+ ScalarType = MVT::i32;
+ ShiftAmt = 32;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue Sra = SMin.getOperand(0);
+ if (Sra.getOpcode() != ISD::SRA)
+ return SDValue();
+
+ ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
+ if (!RightShiftVec)
+ return SDValue();
+ unsigned SExtValue = RightShiftVec->getSExtValue();
+
+ if (SExtValue != ShiftAmt && SExtValue != (ShiftAmt - 1))
+ return SDValue();
+
+ SDValue Mul = Sra.getOperand(0);
+ if (Mul.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ SDValue SExt0 = Mul.getOperand(0);
+ SDValue SExt1 = Mul.getOperand(1);
+
+ if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
+ SExt1.getOpcode() != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ SDValue V0 = SExt0.getOperand(0);
+ SDValue V1 = SExt1.getOperand(0);
+
+ SDLoc DL(N);
+ EVT VecVT = N->getValueType(0);
+ return DAG.getNode(AArch64ISD::SQDMULH, DL, VecVT, V0, V1);
+}
+
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
@@ -20731,6 +20808,9 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N0.getOpcode(), DL, VT, Op);
}
+ if (SDValue V = trySQDMULHCombine(N, DCI, DAG))
+ return V;
+
// Performing the following combine produces a preferable form for ISEL.
// i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
@@ -26356,77 +26436,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
-// A special combine for the vqdmulh family of instructions. This is one of a
-// set of potential patterns that could match this instruction. The base pattern
-// vshl(smin(uzp(smull, smull2), clamp), 1) can be reduced to
-// vshl(vshr(sqdmulh(...), 1), 1) when operating on Q31 data types.
-static SDValue performVSHLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
-
- SDValue Op0 = N->getOperand(0);
- ConstantSDNode *Splat = isConstOrConstSplat(N->getOperand(1));
-
- if (Op0.getOpcode() != ISD::SMIN || !Splat || !Splat->isOne())
- return SDValue();
-
- auto trySQDMULHCombine = [](SDNode *N, SelectionDAG &DAG) -> SDValue {
- EVT VT = N->getValueType(0);
-
- if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
- return SDValue();
-
- ConstantSDNode *Clamp;
-
- if (N->getOpcode() != ISD::SMIN)
- return SDValue();
-
- Clamp = isConstOrConstSplat(N->getOperand(1));
-
- if (!Clamp) {
- return SDValue();
- }
-
- MVT ScalarType;
- int ShftAmt = 0;
- // Here we are considering clamped Arm Q format
- // data types which use 2 upper bits, one for the
- // integer part and one for the sign.
- switch (Clamp->getSExtValue()) {
- case (1ULL << 30) - 1:
- ScalarType = MVT::i32;
- ShftAmt = 32;
- break;
- default:
- return SDValue();
- }
-
- SDValue Mulhs = N->getOperand(0);
- if (Mulhs.getOpcode() != ISD::MULHS)
- return SDValue();
-
- SDValue V0 = Mulhs.getOperand(0);
- SDValue V1 = Mulhs.getOperand(1);
-
- SDLoc DL(Mulhs);
- const unsigned LegalLanes = 128 / ShftAmt;
- EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
- return DAG.getNode(AArch64ISD::SQDMULH, DL, LegalVecVT, V0, V1);
- };
-
- if (SDValue Val = trySQDMULHCombine(Op0.getNode(), DAG)) {
- SDLoc DL(N);
- EVT VecVT = N->getOperand(0).getValueType();
- // Clear lower bits for correctness
- SDValue RightShift =
- DAG.getNode(AArch64ISD::VASHR, DL, VecVT, Val, N->getOperand(1));
- return DAG.getNode(AArch64ISD::VSHL, DL, VecVT, RightShift,
- N->getOperand(1));
- }
-
- return SDValue();
-}
-
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@@ -26567,8 +26576,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMaskedGatherScatterCombine(N, DCI, DAG);
case ISD::FP_EXTEND:
return performFPExtendCombine(N, DAG, DCI, Subtarget);
- case AArch64ISD::VSHL:
- return performVSHLCombine(N, DCI, DAG);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 75910fb47c663..755c3a22ffa89 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9369,6 +9369,9 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
(UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
+def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+ (SQDMULHv8i16 V128:$Rn, V128:$Rm)>;
+
def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
(SQDMULHv4i32 V128:$Rn, V128:$Rm)>;
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
index c1bb370ac3e89..2bc1a427a6b99 100644
--- a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -1,25 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
-define <4 x i32> @arm_mult_q31(ptr %0, ptr %1){
-; CHECK-LABEL: arm_mult_q31:
+define <8 x i16> @saturating_int16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: saturating_int16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmulh v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ret
+ %as = sext <8 x i16> %a to <8 x i32>
+ %bs = sext <8 x i16> %b to <8 x i32>
+ %m = mul <8 x i32> %bs, %as
+ %sh = ashr <8 x i32> %m, splat (i32 15)
+ %ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 32767))
+ %t = trunc <8 x i32> %ma to <8 x i16>
+ ret <8 x i16> %t
+}
+
+define <4 x i32> @saturating_int32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: saturating_int32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+ %as = sext <4 x i32> %a to <4 x i64>
+ %bs = sext <4 x i32> %b to <4 x i64>
+ %m = mul <4 x i64> %bs, %as
+ %sh = ashr <4 x i64> %m, splat (i64 31)
+ %ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 2147483647))
+ %t = trunc <4 x i64> %ma to <4 x i32>
+ ret <4 x i32> %t
+}
+
+define <8 x i16> @saturating_q15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: saturating_q15:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmulh v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ret
+ %as = sext <8 x i16> %a to <8 x i32>
+ %bs = sext <8 x i16> %b to <8 x i32>
+ %m = mul <8 x i32> %bs, %as
+ %sh = ashr <8 x i32> %m, splat (i32 16)
+ %ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 16383))
+ %t = trunc <8 x i32> %ma to <8 x i16>
+ ret <8 x i16> %t
+}
+
+define <4 x i32> @saturating_q31(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: saturating_q31:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: sshr v0.4s, v0.4s, #1
-; CHECK-NEXT: add v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ret
- %7 = getelementptr i8, ptr %0, i64 0
- %9 = getelementptr i8, ptr %1, i64 0
- %12 = load <4 x i32>, ptr %7, align 4
- %13 = sext <4 x i32> %12 to <4 x i64>
- %14 = load <4 x i32>, ptr %9, align 4
- %15 = sext <4 x i32> %14 to <4 x i64>
- %16 = mul nsw <4 x i64> %15, %13
- %17 = lshr <4 x i64> %16, splat (i64 32)
- %18 = trunc nuw <4 x i64> %17 to <4 x i32>
- %19 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %18, <4 x i32> splat (i32 1073741823))
- %20 = shl <4 x i32> %19, splat (i32 1)
- ret <4 x i32> %20
+ %as = sext <4 x i32> %a to <4 x i64>
+ %bs = sext <4 x i32> %b to <4 x i64>
+ %m = mul <4 x i64> %bs, %as
+ %sh = ashr <4 x i64> %m, splat (i64 32)
+ %ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 1073741823))
+ %t = trunc <4 x i64> %ma to <4 x i32>
+ ret <4 x i32> %t
}