[llvm] [AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 (PR #132089)

Wed Mar 26 08:02:16 PDT 2025

================
@@ -1,41 +1,35 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
-# XFAIL: *
-# FIXME-TRUE16 reenable after fix-sgpr-copies is updated for true16 flow
 
 ---
-name:            cmp_f16
+name:            cvt_hi_f32_f16
 body:             |
-  bb.0.entry:
-    ; GCN-LABEL: name: cmp_f16
+  bb.0:
+    ; GCN-LABEL: name: cvt_hi_f32_f16
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
-    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY killed [[COPY]]
-    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
+    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
-    %1:sreg_32 = IMPLICIT_DEF
-    %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
-    %3:sreg_32 = COPY %2:vgpr_16
-    nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
-    %4:sreg_32_xm0_xexec = COPY $scc
-    %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+    %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_16
+    %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
 ...
 
 ---
-name:            cvt_hi_f32_f16
+name:            s_xor_b32
 body:             |
   bb.0:
-    ; GCN-LABEL: name: cvt_hi_f32_f16
+    ; GCN-LABEL: name: s_xor_b32
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+    ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
     %2:sreg_32 = COPY %1:vgpr_16
-    %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
+    %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
----------------
Sisyph wrote:

test is called xor but the instruction used is 'or'

https://github.com/llvm/llvm-project/pull/132089