[clang] [llvm] [ARM] Fix instruction selection for MVE vsbciq intrinsic (PR #118284)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Dec 2 03:26:42 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-arm
Author: Oliver Stannard (ostannard)
<details>
<summary>Changes</summary>
There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics:
* The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in is clear.
* The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear.
These two bugs almost cancelled each other out, but resulted in incorrect code when the vsbcq intrinsics (which take a carry-in) were used and the carry-in was a compile-time constant.
---
Full diff: https://github.com/llvm/llvm-project/pull/118284.diff
5 Files Affected:
- (modified) clang/include/clang/Basic/arm_mve.td (+5-5)
- (modified) clang/test/CodeGen/arm-mve-intrinsics/vadc.c (+4-4)
- (modified) llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll (+90-2)
- (modified) llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll (+1-1)
``````````diff
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 93abbc47c54dd5..6dd8c52ddfd772 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
-multiclass vadcsbc {
+multiclass vadcsbc<dag initial_carry_in> {
def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
(seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
- (seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair,
+ (seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
@@ -1288,13 +1288,13 @@ multiclass vadcsbc {
def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
Ptr<uint>:$carry, Predicate:$pred),
(seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b,
- 0, $pred):$pair,
+ initial_carry_in, $pred):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
}
let params = T.Int32 in {
- defm vadc: vadcsbc;
- defm vsbc: vadcsbc;
+ defm vadc: vadcsbc<(u32 0)>;
+ defm vsbc: vadcsbc<(shl 1, 29)>;
}
let params = T.Int in {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
index 21087b83300c88..29e067d65c6c89 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne
// CHECK-LABEL: @test_vsbciq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) {
// CHECK-LABEL: @test_vsbciq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
@@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 73ee8cf81adcd6..9714370ec7cc8e 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
case Intrinsic::arm_mve_vsbc:
case Intrinsic::arm_mve_vsbc_predicated:
- SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+ SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
IntNo == Intrinsic::arm_mve_vsbc_predicated);
return;
case Intrinsic::arm_mve_vshlc:
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
index 967e8a94900e16..2b4c478dfe3e6a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
@@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
- %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
%2 = lshr i32 %1, 29
%3 = and i32 %2, 1
@@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
@@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
@@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
index 7fe8e94589a0c5..8a0134a4c5ff29 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x
; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: b use_int32x4_t
entry:
- %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912)
%carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
%result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
tail call void @use_int32x4_t(<4 x i32> %result_low)
``````````
</details>
https://github.com/llvm/llvm-project/pull/118284
More information about the cfe-commits
mailing list