[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)

Princeton Ferro via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 13 13:43:39 PST 2025


================
@@ -0,0 +1,2665 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Full FP32x2 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100                      \
+; RUN:         -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs   \
+; RUN: | FileCheck --check-prefixes=CHECK-O0 %s
+; RUN: %if ptxas %{                                                            \
+; RUN:  llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100                     \
+; RUN:           -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:  | %ptxas-verify -arch=sm_100                                           \
+; RUN: %}
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100                      \
+; RUN:         -O3 -verify-machineinstrs                                       \
+; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
+; RUN: %if ptxas %{                                                            \
+; RUN:  llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100                     \
+; RUN:           -O3 -verify-machineinstrs                                     \
+; RUN:  | %ptxas-verify -arch=sm_100                                           \
+; RUN: %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @test_ret_const() #0 {
+;
+; CHECK-O0-LABEL: test_ret_const(
+; CHECK-O0:       {
+; CHECK-O0-NEXT:    .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT:  // %bb.0:
+; CHECK-O0-NEXT:    mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT:    mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O0-NEXT:    ret;
+;
+; CHECK-O3-LABEL: test_ret_const(
+; CHECK-O3:       {
+; CHECK-O3-NEXT:    .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT:  // %bb.0:
+; CHECK-O3-NEXT:    mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT:    mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-O3-NEXT:    ret;
+  ret <2 x float> <float 1.0, float 2.0>
+}
+
+define float @test_extract_0(<2 x float> %a) #0 {
+;
+; CHECK-O0-LABEL: test_extract_0(
+; CHECK-O0:       {
+; CHECK-O0-NEXT:    .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT:  // %bb.0:
+; CHECK-O0-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
+; CHECK-O0-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-O0-NEXT:    ret;
+;
+; CHECK-O3-LABEL: test_extract_0(
+; CHECK-O3:       {
+; CHECK-O3-NEXT:    .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT:  // %bb.0:
+; CHECK-O3-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_extract_0_param_0];
+; CHECK-O3-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-O3-NEXT:    ret;
+  %e = extractelement <2 x float> %a, i32 0
+  ret float %e
+}
+
+define float @test_extract_1(<2 x float> %a) #0 {
+;
+; CHECK-O0-LABEL: test_extract_1(
+; CHECK-O0:       {
+; CHECK-O0-NEXT:    .reg .f32 %f<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT:  // %bb.0:
+; CHECK-O0-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
+; CHECK-O0-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-O0-NEXT:    ret;
+;
+; CHECK-O3-LABEL: test_extract_1(
+; CHECK-O3:       {
+; CHECK-O3-NEXT:    .reg .f32 %f<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT:  // %bb.0:
+; CHECK-O3-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_extract_1_param_0];
+; CHECK-O3-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-O3-NEXT:    ret;
+  %e = extractelement <2 x float> %a, i32 1
+  ret float %e
+}
+
+; NOTE: disabled because -O3 miscompiles this into pointer arithmetic performed
+; directly on test_extract_i_param_0, without first taking the symbol's address
+; (that is, moving it into a temporary).
+; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; ; CHECK-LABEL: test_extract_i(
+; ; CHECK:       {
+; ; CHECK-NEXT:    .reg .pred %p<2>;
+; ; CHECK-NEXT:    .reg .f32 %f<4>;
+; ; CHECK-NEXT:    .reg .b64 %rd<2>;
+; ; CHECK-EMPTY:
+; ; CHECK-NEXT:  // %bb.0:
+; ; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
+; ; CHECK-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
+; ; CHECK-NEXT:    setp.eq.s64 %p1, %rd1, 0;
+; ; CHECK-NEXT:    selp.f32 %f3, %f1, %f2, %p1;
+; ; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; ; CHECK-NEXT:    ret;
+;   %e = extractelement <2 x float> %a, i64 %idx
+;   ret float %e
+; }
+
+define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
+;
+; CHECK-O0-LABEL: test_fadd(
+; CHECK-O0:       {
+; CHECK-O0-NEXT:    .reg .b32 %r<5>;
+; CHECK-O0-NEXT:    .reg .f32 %f<7>;
+; CHECK-O0-NEXT:    .reg .b64 %rd<10>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT:  // %bb.0:
+; CHECK-O0-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-O0-NEXT:    mov.b32 %r1, %f1;
+; CHECK-O0-NEXT:    cvt.u64.u32 %rd2, %r1;
+; CHECK-O0-NEXT:    mov.b32 %r2, %f2;
+; CHECK-O0-NEXT:    cvt.u64.u32 %rd3, %r2;
+; CHECK-O0-NEXT:    shl.b64 %rd4, %rd3, 32;
+; CHECK-O0-NEXT:    or.b64 %rd5, %rd2, %rd4;
+; CHECK-O0-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
+; CHECK-O0-NEXT:    mov.b32 %r3, %f3;
+; CHECK-O0-NEXT:    cvt.u64.u32 %rd6, %r3;
+; CHECK-O0-NEXT:    mov.b32 %r4, %f4;
+; CHECK-O0-NEXT:    cvt.u64.u32 %rd7, %r4;
+; CHECK-O0-NEXT:    shl.b64 %rd8, %rd7, 32;
+; CHECK-O0-NEXT:    or.b64 %rd9, %rd6, %rd8;
+; CHECK-O0-NEXT:    add.rn.f32x2 %rd1, %rd5, %rd9;
+; CHECK-O0-NEXT:    mov.b64 {%f5, %f6}, %rd1;
+; CHECK-O0-NEXT:    st.param.v2.f32 [func_retval0], {%f5, %f6};
+; CHECK-O0-NEXT:    ret;
+;
+; CHECK-O3-LABEL: test_fadd(
+; CHECK-O3:       {
+; CHECK-O3-NEXT:    .reg .f32 %f<5>;
+; CHECK-O3-NEXT:    .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT:  // %bb.0:
+; CHECK-O3-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fadd_param_0];
+; CHECK-O3-NEXT:    mov.b64 %rd2, {%f1, %f2};
+; CHECK-O3-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_fadd_param_1];
+; CHECK-O3-NEXT:    mov.b64 %rd3, {%f3, %f4};
+; CHECK-O3-NEXT:    add.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-O3-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-O3-NEXT:    ret;
+  %r = fadd <2 x float> %a, %b
----------------
Prince781 wrote:

> Whether a particular instruction is supported or not should be a property of the target, and should not depend on the optimization level.

Agreed in general, but shouldn't we also check that optimizations do not introduce new instructions that are unhandled? For example, the following test is currently broken at `-O3`:

https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll#L76-L94

```
% llc < test-extract-i-f16x2.ll -mtriple=nvptx64-nvidia-cuda -O3 -verify-machineinstrs                                                                              :( 134 25-02-13 - 13:42:27
//
// Generated by LLVM NVPTX Back-End
//

.version 6.0
.target sm_30
.address_size 64


# After Instruction Selection
# Machine code for function test_extract_i: IsSSA, TracksLiveness

bb.0 (%ir-block.0):
  %0:int64regs = LD_i64_avar 0, 0, 101, 1, 0, 64, &test_extract_i_param_1 :: (dereferenceable invariant load (s64) from `ptr addrspace(101) null`, addrspace 101)
  %1:int64regs = ANDb64ri killed %0:int64regs, 1
  %2:int64regs = SHLi64ri killed %1:int64regs, 1
  %3:int64regs = ADDi64rr &test_extract_i_param_0, killed %2:int64regs
  %4:int16regs = LD_i16_areg_64 0, 0, 0, 1, 3, 16, killed %3:int64regs :: (dereferenceable invariant load (s16), addrspace 101)
  StoreRetvalI16 killed %4:int16regs, 0 :: (store (s16), align 1)
  Return

# End machine code for function test_extract_i.

*** Bad machine code: Expected a register operand. ***
- function:    test_extract_i
- basic block: %bb.0  (0x459ca9f0)
- instruction: %3:int64regs = ADDi64rr &test_extract_i_param_0, killed %2:int64regs
- operand 1:   &test_extract_i_param_0
LLVM ERROR: Found 1 machine code errors.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.	Program arguments: llc -mtriple=nvptx64-nvidia-cuda -O3 -verify-machineinstrs
1.	Running pass 'Function Pass Manager' on module '<stdin>'.
2.	Running pass 'Verify generated machine code' on function '@test_extract_i'
 #0 0x0000000007e11b9c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /home/pferro/repos/llvm-project/llvm/lib/Support/Unix/Signals.inc:804:22
 #1 0x0000000007e11fda PrintStackTraceSignalHandler(void*) /home/pferro/repos/llvm-project/llvm/lib/Support/Unix/Signals.inc:880:1
 #2 0x0000000007e0f792 llvm::sys::RunSignalHandlers() /home/pferro/repos/llvm-project/llvm/lib/Support/Signals.cpp:105:20
 #3 0x0000000007e11546 SignalHandler(int, siginfo_t*, void*) /home/pferro/repos/llvm-project/llvm/lib/Support/Unix/Signals.inc:418:13
 #4 0x00007f0dd0140090 __restore_rt (/lib64/libc.so.6+0x1a090)
 #5 0x00007f0dd01990f4 __pthread_kill_implementation (/lib64/libc.so.6+0x730f4)
 #6 0x00007f0dd013ffde gsignal (/lib64/libc.so.6+0x19fde)
 #7 0x00007f0dd0127942 abort (/lib64/libc.so.6+0x1942)
 #8 0x0000000007d39fd2 llvm::report_fatal_error(llvm::Twine const&, bool) /home/pferro/repos/llvm-project/llvm/lib/Support/ErrorHandling.cpp:126:9
 #9 0x000000000682df5b (anonymous namespace)::MachineVerifier::ReportedErrors::~ReportedErrors() /home/pferro/repos/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp:262:33
#10 0x000000000682e0dc (anonymous namespace)::MachineVerifier::~MachineVerifier() /home/pferro/repos/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp:102:8
#11 0x000000000682e214 (anonymous namespace)::MachineVerifierLegacyPass::runOnMachineFunction(llvm::MachineFunction&) /home/pferro/repos/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp:391:12
#12 0x000000000667682c llvm::MachineFunctionPass::runOnFunction(llvm::Function&) /home/pferro/repos/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp:94:33
#13 0x0000000006ee647f llvm::FPPassManager::runOnFunction(llvm::Function&) /home/pferro/repos/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1406:20
#14 0x0000000006ee66e1 llvm::FPPassManager::runOnModule(llvm::Module&) /home/pferro/repos/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1452:13
#15 0x0000000006ee6aee (anonymous namespace)::MPPassManager::runOnModule(llvm::Module&) /home/pferro/repos/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1521:20
#16 0x0000000006ee21b1 llvm::legacy::PassManagerImpl::run(llvm::Module&) /home/pferro/repos/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:539:13
#17 0x0000000006ee7319 llvm::legacy::PassManager::run(llvm::Module&) /home/pferro/repos/llvm-project/llvm/lib/IR/LegacyPassManager.cpp:1649:1
#18 0x0000000004a37b51 compileModule(char**, llvm::LLVMContext&) /home/pferro/repos/llvm-project/llvm/tools/llc/llc.cpp:753:34
#19 0x0000000004a353f1 main /home/pferro/repos/llvm-project/llvm/tools/llc/llc.cpp:411:35
#20 0x00007f0dd0129248 __libc_start_call_main (/lib64/libc.so.6+0x3248)
#21 0x00007f0dd012930b __libc_start_main at GLIBC_2.2.5 (/lib64/libc.so.6+0x330b)
#22 0x0000000004a33fa5 _start (/home/pferro/repos/llvm-project/llvm/build-Debug/bin/llc+0x4a33fa5)
[1]    2423725 IOT instruction (core dumped)  llc -mtriple=nvptx64-nvidia-cuda -O3 -verify-machineinstrs <
```

https://github.com/llvm/llvm-project/pull/126337


More information about the llvm-commits mailing list