[llvm] [AArch64][CostModel] Add NFC tests for extractelement cost (PR #108941)

Tue Sep 17 01:56:07 PDT 2024

https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/108941

>From e81c333d96538c2156084ad2ae562a43ab07e8a9 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 17 Sep 2024 14:18:35 +0530
Subject: [PATCH] [AArch64][CostModel] Add NFC tests for extractelement cost

A subsequent patch aims to reduce the extractelement cost when the only user(s) of the extract are fmul instructions.
---
 .../CostModel/AArch64/extract_float.ll        | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AArch64/extract_float.ll

diff --git a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
new file mode 100644
index 00000000000000..dd3d0289bbb1cf
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-unknown-linux \
+; RUN:       -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-unknown-linux \
+; RUN:       -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
+
+; res = lane 0 * lane 1
+define double @extract_case1(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case1'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %a, i32 1
+  %res = fmul double %1, %2
+  ret double %res
+}
+
+; res = lane 1 * lane 1
+define double @extract_case2(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 1
+  %res = fmul double %1, %1
+  ret double %res
+}
+
+; res = lane 0 * lane 0
+define double @extract_case3(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case3'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %res = fmul double %1, %1
+  ret double %res
+}
+
+; res = lane 0 * scalar
+define double @extract_case4(<2 x double> %a, double %b) {
+; CHECK-LABEL: 'extract_case4'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %res = fmul double %1, %b
+  ret double %res
+}
+
+; res = lane 1 * scalar
+define double @extract_case5(<2 x double> %a, double %b) {
+; CHECK-LABEL: 'extract_case5'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %b
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 1
+  %res = fmul double %1, %b
+  ret double %res
+}
+
+; Input vector = <3 x double> (i.e. odd length vector)
+; res = lane 0 * lane 1
+define double @extract_case6(<3 x double> %a) {
+; CHECK-LABEL: 'extract_case6'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <3 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <3 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <3 x double> %a, i32 0
+  %2 = extractelement <3 x double> %a, i32 1
+  %res = fmul double %1, %2
+  ret double %res
+}
+
+; res = lane 1 * lane 2
+; Extracting lane 2 of <4 x double> is equivalent to extracting lane 0 of the
+; second 128-bit register. For other register sizes this is not the case.
+define double @extract_case7(<4 x double> %a) {
+; CHECK-LABEL: 'extract_case7'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <4 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <4 x double> %a, i32 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <4 x double> %a, i32 1
+  %2 = extractelement <4 x double> %a, i32 2
+  %res = fmul double %1, %2
+  ret double %res
+}
+
+; res = lane 0 * lane 1
+; The extract from lane 1 is also used by an insertelement feeding reduce.fmul.
+define double @extract_case8(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case8'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %2 = insertelement <2 x double> %a, double %1, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> %2)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = fmul double %3, %4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %5
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %a, i32 1
+  %3 = insertelement <2 x double> %a, double %2, i32 0
+  %4 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> %3)
+  %5 = fmul double %1, %2
+  %6 = fmul double %4, %5
+  ret double %6
+}
+
+; res = lane 0 * lane 1
+; The extract from lane 1 is also used by an insertelement feeding reduce.fadd.
+define double @extract_case9(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case9'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %2 = insertelement <2 x double> %a, double %1, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %3 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> %2)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %4 = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = fmul double %3, %4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %5
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %a, i32 1
+  %3 = insertelement <2 x double> %a, double %2, i32 0
+  %4 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> %3)
+  %5 = fmul double %1, %2
+  %6 = fmul double %4, %5
+  ret double %6
+}
+
+; res = lane 0 * lane 1
+; The extract from lane 1 is also passed as an argument to a call.
+define double @extract_case10(<4 x double> %a) {
+; CHECK-LABEL: 'extract_case10'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <4 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <4 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @foo(double %1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = fmul double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %2
+entry:
+  %1 = extractelement <4 x double> %a, i32 0
+  %2 = extractelement <4 x double> %a, i32 1
+  call void @foo(double %2)
+  %3 = fmul double %1, %2
+  ret double %3
+}
+
+; res = lane 0 * lane 1
+define half @extract_case11(<2 x half> %a) {
+; CHECK-LABEL: 'extract_case11'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret half %res
+entry:
+  %1 = extractelement <2 x half> %a, i32 0
+  %2 = extractelement <2 x half> %a, i32 1
+  %res = fmul half %1, %2
+  ret half %res
+}
+
+; res = lane 0 * lane 1
+define float @extract_case12(<2 x float> %a) {
+; CHECK-LABEL: 'extract_case12'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x float> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x float> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul float %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %res
+entry:
+  %1 = extractelement <2 x float> %a, i32 0
+  %2 = extractelement <2 x float> %a, i32 1
+  %res = fmul float %1, %2
+  ret float %res
+}
+
+; res = lane 0 + lane 1
+; The extracts are used by a binary op other than fmul (here: fadd).
+define double @extract_case13(<2 x double> %a) {
+; CHECK-LABEL: 'extract_case13'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = fadd double %0, %1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
+entry:
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %a, i32 1
+  %res = fadd double %1, %2
+  ret double %res
+}
+
+declare void @foo(double)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FULLFP16: {{.*}}
+; NOFP16: {{.*}}