[PATCH] D99772: [AMDGPU] Check for NaN when folding output modifiers

Thu Apr 1 15:00:44 PDT 2021

bcahoon created this revision.
bcahoon added a reviewer: arsenm.
Herald added subscribers: kerbowa, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
bcahoon requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

The hardware ignores output modifiers (omod) when IEEE mode is set.
However, if the no-NaNs flag is set on an instruction, the output modifier
folding optimization can proceed as if IEEE mode were not set.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D99772

Files:
  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
  llvm/test/CodeGen/AMDGPU/omod.ll


Index: llvm/test/CodeGen/AMDGPU/omod.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/omod.ll
+++ llvm/test/CodeGen/AMDGPU/omod.ll
@@ -105,6 +105,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_omod_div2_f64_nnan:
+; GCN: v_add_f64  v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 div:2{{$}}
+define amdgpu_kernel void @v_omod_div2_f64_nnan(double %a) #5 {
+  %add = fadd nnan nsz double %a, 1.0
+  %div2 = fmul nnan nsz double %add, 0.5
+  store double %div2, double addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_omod_mul2_f32:
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
 define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
@@ -123,6 +132,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_omod_mul2_f64_nnan:
+; GCN: v_add_f64  v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:2{{$}}
+define amdgpu_kernel void @v_omod_mul2_f64_nnan(double %a) #5 {
+  %add = fadd nnan nsz double %a, 1.0
+  %div2 = fmul nnan nsz double %add, 2.0
+  store double %div2, double addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_omod_mul4_f32:
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
 define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
@@ -141,6 +159,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_omod_mul4_f64_nnan:
+; GCN: v_add_f64  v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 mul:4{{$}}
+define amdgpu_kernel void @v_omod_mul4_f64_nnan(double %a) #5 {
+  %add = fadd nnan nsz double %a, 1.0
+  %div2 = fmul nnan nsz double %add, 4.0
+  store double %div2, double addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1771,10 +1771,12 @@
         if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
           CurrentKnownM0Val = nullptr;
 
+        // The IEEE mode value can be overridden, which is detected by checking
+        // the no-NaNs flag (FmNoNans) on the instruction.
         // TODO: Omod might be OK if there is NSZ only on the source
         // instruction, and not the omod multiply.
-        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
-            !tryFoldOMod(MI))
+        if ((IsIEEEMode && !MI.getFlag(MachineInstr::FmNoNans)) ||
+            (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI))
           tryFoldClamp(MI);
 
         continue;


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D99772.334830.patch
Type: text/x-patch
Size: 2645 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210401/b6b20bf0/attachment.bin>