[llvm] [AArch64][CodeGen] Fix wrong operand order when creating vcmla intrinsic (PR #65278)

Tue Sep 5 03:22:00 PDT 2023

daisy202309 wrote:

> Please note in the commit message whether this is actually a visible miscompile; at first glance, multiplication is commutative.
> 
> Assuming it is a miscompile, please verify that using the intrinsics from clang produces the correct result.
> 
> Otherwise LGTM

For the `fcmla` instruction with a rotation of 90 or 270, swapping the last two operands produces a different result.

Here is an example:
```
#include<arm_neon.h>
#include<stdio.h>

/* Loads *acc, *lhs and *rhs with vld1q_dup_f64 and passes the three vectors
 * to vcmlaq_f64 in (acc, lhs, rhs) order. */
float64x2_t test_rot0(float64_t *acc, float64_t *lhs, float64_t *rhs) {
  const float64x2_t acc_vec = vld1q_dup_f64(acc);
  const float64x2_t lhs_vec = vld1q_dup_f64(lhs);
  const float64x2_t rhs_vec = vld1q_dup_f64(rhs);

  return vcmlaq_f64(acc_vec, lhs_vec, rhs_vec);
}

/* Loads *acc, *lhs and *rhs with vld1q_dup_f64 and passes the three vectors
 * to vcmlaq_rot90_f64 in (acc, lhs, rhs) order. */
float64x2_t test_rot90(float64_t *acc, float64_t *lhs, float64_t *rhs) {
  const float64x2_t acc_vec = vld1q_dup_f64(acc);
  const float64x2_t lhs_vec = vld1q_dup_f64(lhs);
  const float64x2_t rhs_vec = vld1q_dup_f64(rhs);

  return vcmlaq_rot90_f64(acc_vec, lhs_vec, rhs_vec);
}

/* Loads *acc, *lhs and *rhs with vld1q_dup_f64 and passes the three vectors
 * to vcmlaq_rot180_f64 in (acc, lhs, rhs) order. */
float64x2_t test_rot180(float64_t *acc, float64_t *lhs, float64_t *rhs) {
  const float64x2_t acc_vec = vld1q_dup_f64(acc);
  const float64x2_t lhs_vec = vld1q_dup_f64(lhs);
  const float64x2_t rhs_vec = vld1q_dup_f64(rhs);

  return vcmlaq_rot180_f64(acc_vec, lhs_vec, rhs_vec);
}

/* Loads *acc, *lhs and *rhs with vld1q_dup_f64 and passes the three vectors
 * to vcmlaq_rot270_f64 in (acc, lhs, rhs) order. */
float64x2_t test_rot270(float64_t *acc, float64_t *lhs, float64_t *rhs) {
  const float64x2_t acc_vec = vld1q_dup_f64(acc);
  const float64x2_t lhs_vec = vld1q_dup_f64(lhs);
  const float64x2_t rhs_vec = vld1q_dup_f64(rhs);

  return vcmlaq_rot270_f64(acc_vec, lhs_vec, rhs_vec);
}

int main() {
  float64_t acc[] = {100, 100};
  float64_t lhs[] = {10, 10};
  float64_t rhs[] = {77, 77};
  float64_t r0[] = {1000, 1000};
  float64_t r90[] = {1000, 1000};
  float64_t r180[] = {1000, 1000};
  float64_t r270[] = {1000, 1000};

  vst1q_lane_f64(r0, test_rot0(acc, lhs, rhs) , 0);
  vst1q_lane_f64(&r0[1], test_rot0(acc, rhs, lhs), 1);

  vst1q_lane_f64(r90, test_rot90(acc, lhs, rhs) , 0);
  vst1q_lane_f64(&r90[1], test_rot90(acc, rhs, lhs), 1);

  vst1q_lane_f64(r180, test_rot180(acc, lhs, rhs) , 0);
  vst1q_lane_f64(&r180[1], test_rot180(acc, rhs, lhs), 1);

  vst1q_lane_f64(r270, test_rot270(acc, lhs, rhs) , 0);
  vst1q_lane_f64(&r270[1], test_rot270(acc, rhs, lhs), 1);

  printf("r0: %lf %lf\n", r0[0], r0[1]);
  printf("r90: %lf %lf\n", r90[0], r90[1]);
  printf("r180: %lf %lf\n", r180[0], r180[1]);
  printf("r270: %lf %lf\n", r270[0], r270[1]);
}
```
The result:
```
r0: 870.000000 870.000000
r90: -670.000000 870.000000
r180: -670.000000 -670.000000
r270: 870.000000 -670.000000
```

https://github.com/llvm/llvm-project/pull/65278