[llvm] [NVPTX] Improve folding to mad with immediate 1 (PR #93628)

Tue May 28 18:57:05 PDT 2024

================
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O1 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O1 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O1 | %ptxas-verify %}
+
+define i32 @test1(i32 %n, i32 %m) {
+; Computes (%n + 1) * %m; the checks expect a mad.lo.s32 with operands %m, %n, %m.
+; CHECK: ld.param.u32   %[[N:r[0-9]+]], [test1_param_0];
+; CHECK: ld.param.u32   %[[M:r[0-9]+]], [test1_param_1];
+; CHECK: mad.lo.s32     %[[MAD:r[0-9]+]], %[[M]], %[[N]], %[[M]];
+; CHECK: st.param.b32   [func_retval0+0], %[[MAD]];
+; The mad result is the value expected to be stored to the return slot.
+  %add = add i32 %n, 1
+  %mul = mul i32 %add, %m
+  ret i32 %mul
+}
+
+define i32 @test1_rev(i32 %n, i32 %m) {
+; Computes %m * (%n + 1), i.e. the mul operands are swapped relative to test1.
+; CHECK: ld.param.u32   %[[N:r[0-9]+]], [test1_rev_param_0];
+; CHECK: ld.param.u32   %[[M:r[0-9]+]], [test1_rev_param_1];
+; CHECK: mad.lo.s32     %[[MAD:r[0-9]+]], %[[M]], %[[N]], %[[M]];
+; CHECK: st.param.b32   [func_retval0+0], %[[MAD]];
+; The expected mad operands (%m, %n, %m) are the same as in test1.
+  %add = add i32 %n, 1
+  %mul = mul i32 %m, %add
+  ret i32 %mul
+}
+
+; Transpose (mul (select)) if it can then be folded to mad
----------------
Artem-B wrote:

Does it buy us anything?

`mul(m,select(1,n))` will probably have the same performance as `select(mul(m,n), m)`, since the critical path will always contain a `mul` and a `select`, just in a different order.

https://github.com/llvm/llvm-project/pull/93628