[clang] b2ec416 - [ARM] Fix NaN behaviour for MVE compare intrinsics (#116371)

Tue Nov 19 02:23:18 PST 2024

Author: Oliver Stannard
Date: 2024-11-19T10:23:14Z
New Revision: b2ec416aa5bcd89c4bc295163d60e5a2ecb99212

URL: https://github.com/llvm/llvm-project/commit/b2ec416aa5bcd89c4bc295163d60e5a2ecb99212
DIFF: https://github.com/llvm/llvm-project/commit/b2ec416aa5bcd89c4bc295163d60e5a2ecb99212.diff

LOG: [ARM] Fix NaN behaviour for MVE compare intrinsics (#116371)

The MVE intrinsics are defined as having the same behaviour as the
instructions which they correspond to. In particular, the vcmpleq and
vcmpltq intrinsics correspond to the VCMP instruction with the LE or LT
condition. However, these instructions with these two conditions do not
match the normal IEEE754 behaviour for NaNs, they return true if either
operand is a NaN, instead of false. Therefore we need to generate `fcmp`
IR instructions with the `ule` and `ult` conditions, instead of `ole`
and `olt`.

This differs from AdvSIMD, where only instructions with the EQ, GE and
GT conditions are available, and the intrinsics for the others are
defined by swapping the condition and operand order, so the results
match the IEEE754 behaviour for NaNs.

Added: 
    

Modified: 
    clang/include/clang/Basic/arm_mve.td
    clang/include/clang/Basic/arm_mve_defs.td
    clang/test/CodeGen/arm-mve-intrinsics/compare.c

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 1debb94a0a7b81..93abbc47c54dd5 100644

--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -753,8 +753,8 @@ let params = T.Float in {
   defm: compare<"ne", fcmp_ne>;
   defm: compare<"gt", fcmp_gt>;
   defm: compare<"ge", fcmp_ge>;
-  defm: compare<"lt", fcmp_lt>;
-  defm: compare<"le", fcmp_le>;
+  defm: compare<"lt", fcmp_ult>;
+  defm: compare<"le", fcmp_ule>;
 }
 
 let params = T.Signed in {

diff  --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
index 1a090c08cc8531..634b4e9c2c9c8a 100644
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -116,8 +116,8 @@ def fcmp_eq: IRBuilder<"CreateFCmpOEQ">;
 def fcmp_ne: IRBuilder<"CreateFCmpUNE">; // not O: it must return true on NaNs
 def fcmp_gt: IRBuilder<"CreateFCmpOGT">;
 def fcmp_ge: IRBuilder<"CreateFCmpOGE">;
-def fcmp_lt: IRBuilder<"CreateFCmpOLT">;
-def fcmp_le: IRBuilder<"CreateFCmpOLE">;
+def fcmp_ult: IRBuilder<"CreateFCmpULT">;
+def fcmp_ule: IRBuilder<"CreateFCmpULE">;
 def splat: CGHelperFn<"ARMMVEVectorSplat">;
 def select: IRBuilder<"CreateSelect">;
 def fneg: IRBuilder<"CreateFNeg">;

diff  --git a/clang/test/CodeGen/arm-mve-intrinsics/compare.c b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
index 8f190990a65869..8886cf5c100581 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/compare.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/compare.c
@@ -2376,7 +2376,7 @@ mve_pred16_t test_vcmphiq_m_n_u32(uint32x4_t a, uint32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpleq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2392,7 +2392,7 @@ mve_pred16_t test_vcmpleq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpleq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2458,7 +2458,7 @@ mve_pred16_t test_vcmpleq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2476,7 +2476,7 @@ mve_pred16_t test_vcmpleq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2548,7 +2548,7 @@ mve_pred16_t test_vcmpleq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2567,7 +2567,7 @@ mve_pred16_t test_vcmpleq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2645,7 +2645,7 @@ mve_pred16_t test_vcmpleq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2666,7 +2666,7 @@ mve_pred16_t test_vcmpleq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2746,7 +2746,7 @@ mve_pred16_t test_vcmpleq_m_n_s32(int32x4_t a, int32_t b, mve_pred16_t p)
 
 // CHECK-LABEL: @test_vcmpltq_f16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2762,7 +2762,7 @@ mve_pred16_t test_vcmpltq_f16(float16x8_t a, float16x8_t b)
 
 // CHECK-LABEL: @test_vcmpltq_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2828,7 +2828,7 @@ mve_pred16_t test_vcmpltq_s32(int32x4_t a, int32x4_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2846,7 +2846,7 @@ mve_pred16_t test_vcmpltq_n_f16(float16x8_t a, float16_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP0:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]])
 // CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 // CHECK-NEXT:    ret i16 [[TMP2]]
@@ -2918,7 +2918,7 @@ mve_pred16_t test_vcmpltq_n_s32(int32x4_t a, int32_t b)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -2937,7 +2937,7 @@ mve_pred16_t test_vcmpltq_m_f16(float16x8_t a, float16x8_t b, mve_pred16_t p)
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[B:%.*]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3015,7 +3015,7 @@ mve_pred16_t test_vcmpltq_m_s32(int32x4_t a, int32x4_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x half> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x half> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
@@ -3036,7 +3036,7 @@ mve_pred16_t test_vcmpltq_m_n_f16(float16x8_t a, float16_t b, mve_pred16_t p)
 // CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
 // CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
 // CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[A:%.*]], [[DOTSPLAT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[A:%.*]], [[DOTSPLAT]]
 // CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]])
 // CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16