[llvm] [HEXAGON] Fix corner cases for hwloops pass (PR #135439)

via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 11 17:42:57 PDT 2025


https://github.com/aankit-quic updated https://github.com/llvm/llvm-project/pull/135439

>From d13e1915c3c73c3bec4e2f908b86917e9d52bd0c Mon Sep 17 00:00:00 2001
From: aankit-quic <aankit at quicinc.com>
Date: Tue, 1 Apr 2025 10:57:14 -0700
Subject: [PATCH 1/2] [HEXAGON] Fix corner cases for hwloops pass

Add checks to the Hexagon hardware loops pass to make sure that
Dist > 0 or Dist < 0 holds for the appropriate comparison cases.
The change modifies the HexagonHardwareLoops pass to emit runtime
checks that end_value > initial_value for less-than comparisons and
end_value < initial_value for greater-than comparisons.

Change-Id: Ie4b3666ecf69b7aebeb6cfaa48535063677f929c
---
 .../Target/Hexagon/HexagonHardwareLoops.cpp   |  46 ++-
 .../CodeGen/Hexagon/hwloop-dist-check.mir     | 281 ++++++++++++++++++
 llvm/test/CodeGen/Hexagon/swp-phi-start.ll    |   5 +-
 3 files changed, 329 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir

diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 743991447ad1f..67fd03f7ba376 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -731,6 +731,11 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
                                                Register IVReg,
                                                int64_t IVBump,
                                                Comparison::Kind Cmp) const {
+  LLVM_DEBUG(llvm::dbgs() << "Loop: " << *Loop << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "Initial Value: " << *Start << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "End Value: " << *End << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "Inc/Dec Value: " << IVBump << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "Comparison: " << Cmp << "\n");
   // Cannot handle comparison EQ, i.e. while (A == B).
   if (Cmp == Comparison::EQ)
     return nullptr;
@@ -846,6 +851,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   if (IVBump < 0) {
     std::swap(Start, End);
     IVBump = -IVBump;
+    std::swap(CmpLess, CmpGreater);
   }
   // Cmp may now have a wrong direction, e.g.  LEs may now be GEs.
   // Signedness, and "including equality" are preserved.
@@ -989,7 +995,45 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
     CountSR = 0;
   }
 
-  return new CountValue(CountValue::CV_Register, CountR, CountSR);
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  Register MuxR = CountR;
+  unsigned MuxSR = CountSR;
+  // For the loop count to be a valid unsigned number, CmpLess should imply
+  // Dist >= 0. Similarly, CmpGreater should imply Dist < 0. We can skip the
+  // check if the initial value is zero and the comparison is LTu || LEu.
+  if (!(Start->isImm() && StartV == 0 && Comparison::isUnsigned(Cmp) &&
+        CmpLess) &&
+      (CmpLess || CmpGreater)) {
+    // Generate:
+    //   DistCheck = CMP_GT DistR,  0   --> CmpLess
+    //   DistCheck = CMP_GT DistR, -1   --> CmpGreater
+    Register DistCheckR = MRI->createVirtualRegister(PredRC);
+    const MCInstrDesc &DistCheckD = TII->get(Hexagon::C2_cmpgti);
+    BuildMI(*PH, InsertPos, DL, DistCheckD, DistCheckR)
+        .addReg(DistR, 0, DistSR)
+        .addImm((CmpLess) ? 0 : -1);
+
+    // Generate:
+    //   MUXR = MUX DistCheck, CountR, 1   --> CmpLess
+    //   MUXR = MUX DistCheck, 1, CountR   --> CmpGreater
+    MuxR = MRI->createVirtualRegister(IntRC);
+    if (CmpLess) {
+      const MCInstrDesc &MuxD = TII->get(Hexagon::C2_muxir);
+      BuildMI(*PH, InsertPos, DL, MuxD, MuxR)
+          .addReg(DistCheckR)
+          .addReg(CountR, 0, CountSR)
+          .addImm(1);
+    } else {
+      const MCInstrDesc &MuxD = TII->get(Hexagon::C2_muxri);
+      BuildMI(*PH, InsertPos, DL, MuxD, MuxR)
+          .addReg(DistCheckR)
+          .addImm(1)
+          .addReg(CountR, 0, CountSR);
+    }
+    MuxSR = 0;
+  }
+
+  return new CountValue(CountValue::CV_Register, MuxR, MuxSR);
 }
 
 /// Return true if the operation is invalid within hardware loop.
diff --git a/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir b/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir
new file mode 100644
index 0000000000000..f5286d6e86fc1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir
@@ -0,0 +1,281 @@
+# RUN: llc -run-pass=hwloops %s -o - | FileCheck %s
+
+# CHECK-LABEL: name: f
+# CHECK: [[R1:%[0-9]+]]:predregs = C2_cmpgti [[R0:%[0-9]+]], 0
+# CHECK: [[R3:%[0-9]+]]:intregs = C2_muxir [[R1:%[0-9]+]], [[R2:%[0-9]+]], 1
+# CHECK-LABEL: name: g
+# CHECK: [[R1:%[0-9]+]]:predregs = C2_cmpgti [[R0:%[0-9]+]], 0
+# CHECK: [[R3:%[0-9]+]]:intregs = C2_muxir [[R1:%[0-9]+]], [[R2:%[0-9]+]], 1
+--- |
+  @a = dso_local global [255 x ptr] zeroinitializer, align 8
+
+  ; Function Attrs: minsize nofree norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none)
+  define dso_local void @f(i32 noundef %m) local_unnamed_addr #0 {
+  entry:
+    %cond = tail call i32 @llvm.smax.i32(i32 %m, i32 2)
+    %0 = add nsw i32 %cond, -4
+    %1 = shl i32 %cond, 3
+    %cgep = getelementptr i8, ptr @a, i32 %1
+    %cgep36 = bitcast ptr @a to ptr
+    br label %do.body
+
+  do.body:                                          ; preds = %do.body, %entry
+    %lsr.iv1 = phi ptr [ %cgep4, %do.body ], [ %cgep, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %0, %entry ]
+    %sh.0 = phi i32 [ 256, %entry ], [ %shr, %do.body ]
+    %shr = lshr i32 %sh.0, 1
+    %cgep5 = getelementptr inbounds [255 x ptr], ptr %cgep36, i32 0, i32 %shr
+    store ptr %lsr.iv1, ptr %cgep5, align 4, !tbaa !5
+    %lsr.iv.next = add nsw i32 %lsr.iv, 4
+    %cmp1 = icmp samesign ult i32 %lsr.iv.next, 1073741836
+    %cgep4 = getelementptr i8, ptr %lsr.iv1, i32 32
+    br i1 %cmp1, label %do.body, label %do.end, !llvm.loop !9
+
+  do.end:                                           ; preds = %do.body
+    ret void
+  }
+
+  ; Function Attrs: minsize nofree norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none)
+  define dso_local void @g(i32 noundef %m) local_unnamed_addr #0 {
+  entry:
+    %0 = add i32 %m, -4
+    %1 = shl i32 %m, 3
+    %cgep = getelementptr i8, ptr @a, i32 %1
+    %cgep36 = bitcast ptr @a to ptr
+    br label %do.body
+
+  do.body:                                          ; preds = %do.body, %entry
+    %lsr.iv1 = phi ptr [ %cgep4, %do.body ], [ %cgep, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %do.body ], [ %0, %entry ]
+    %sh.0 = phi i32 [ 256, %entry ], [ %shr, %do.body ]
+    %shr = lshr i32 %sh.0, 1
+    %cgep5 = getelementptr inbounds [255 x ptr], ptr %cgep36, i32 0, i32 %shr
+    store ptr %lsr.iv1, ptr %cgep5, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, 4
+    %cmp = icmp slt i32 %lsr.iv.next, 1073741836
+    %cgep4 = getelementptr i8, ptr %lsr.iv1, i32 32
+    br i1 %cmp, label %do.body, label %do.end, !llvm.loop !11
+
+  do.end:                                           ; preds = %do.body
+    ret void
+  }
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare i32 @llvm.smax.i32(i32, i32) #1
+
+  attributes #0 = { minsize nofree norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+  !llvm.module.flags = !{!0, !1, !2, !3}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 8, !"PIC Level", i32 2}
+  !2 = !{i32 7, !"PIE Level", i32 2}
+  !3 = !{i32 7, !"frame-pointer", i32 2}
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"any pointer", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+  !9 = distinct !{!9, !10}
+  !10 = !{!"llvm.loop.mustprogress"}
+  !11 = distinct !{!11, !10}
+
+...
+---
+name:            f
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 1, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 2, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 3, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 4, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 5, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 6, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 7, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 8, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 9, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 10, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 11, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 12, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 13, class: predregs, preferred-register: '', flags: [  ] }
+  - { id: 14, class: predregs, preferred-register: '', flags: [  ] }
+  - { id: 15, class: intregs, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$r0', virtual-reg: '%9' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0
+
+    %9:intregs = COPY $r0
+    %11:intregs = A2_tfrsi 2
+    %12:intregs = A2_max %9, %11
+    %0:intregs = nsw A2_addi %12, -4
+    %1:intregs = S4_addi_asl_ri @a, %12, 3
+    %2:intregs = A2_tfrsi @a
+    %10:intregs = A2_tfrsi 256
+
+  bb.1.do.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    %3:intregs = PHI %1, %bb.0, %8, %bb.1
+    %4:intregs = PHI %0, %bb.0, %7, %bb.1
+    %5:intregs = PHI %10, %bb.0, %15, %bb.1
+    %15:intregs = S2_extractu %5, 8, 1
+    S4_storeri_rr %2, %15, 2, %3 :: (store (s32) into %ir.cgep5, !tbaa !5)
+    %7:intregs = nsw A2_addi %4, 4
+    %13:predregs = C2_cmpgtui %7, 1073741835
+    %8:intregs = A2_addi %3, 32
+    J2_jumpf %13, %bb.1, implicit-def dead $pc
+    J2_jump %bb.2, implicit-def dead $pc
+
+  bb.2.do.end:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
+---
+name:            g
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 1, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 2, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 3, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 4, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 5, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 6, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 7, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 8, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 9, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 10, class: intregs, preferred-register: '', flags: [  ] }
+  - { id: 11, class: predregs, preferred-register: '', flags: [  ] }
+  - { id: 12, class: predregs, preferred-register: '', flags: [  ] }
+  - { id: 13, class: intregs, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$r0', virtual-reg: '%9' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0
+
+    %9:intregs = COPY $r0
+    %0:intregs = A2_addi %9, -4
+    %1:intregs = S4_addi_asl_ri @a, %9, 3
+    %2:intregs = A2_tfrsi @a
+    %10:intregs = A2_tfrsi 256
+
+  bb.1.do.body:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    %3:intregs = PHI %1, %bb.0, %8, %bb.1
+    %4:intregs = PHI %0, %bb.0, %7, %bb.1
+    %5:intregs = PHI %10, %bb.0, %13, %bb.1
+    %13:intregs = S2_extractu %5, 8, 1
+    S4_storeri_rr %2, %13, 2, %3 :: (store (s32) into %ir.cgep5, !tbaa !5)
+    %7:intregs = A2_addi %4, 4
+    %11:predregs = C2_cmpgti %7, 1073741835
+    %8:intregs = A2_addi %3, 32
+    J2_jumpf %11, %bb.1, implicit-def dead $pc
+    J2_jump %bb.2, implicit-def dead $pc
+
+  bb.2.do.end:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-start.ll b/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
index 52c258656ec22..6c2b08d83b1c7 100644
--- a/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
@@ -5,8 +5,9 @@
 ; the same stage.
 
 ; CHECK-DAG: [[REG3:(r[0-9]+)]] = add([[REG1:(r[0-9]+)]],#-1)
-; CHECK-DAG: [[REG2:(r[0-9]+)]] = add([[REG1]],#-1)
-; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],[[REG3]])
+; CHECK-DAG: [[REG2:(r[0-9]+)]] = add([[REG4:(r[0-9]+)]],#-1)
+; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],[[REG2]])
+; CHECK-NOT: = [[REG3]]
 ; CHECK-NOT: = [[REG2]]
 ; CHECK: .LBB0_[[LOOP]]:
 ; CHECK: }{{[ \t]*}}:endloop

>From f80d13c1c50195c44ecbf0de1c148312f540f406 Mon Sep 17 00:00:00 2001
From: aankit-quic <aankit at quicinc.com>
Date: Fri, 11 Apr 2025 17:42:22 -0700
Subject: [PATCH 2/2] Fix lit test

Change-Id: I2fd135a0086db33ab0fd1520d88d38dbadde884a
---
 llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir b/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir
index f5286d6e86fc1..9f8c14a314309 100644
--- a/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir
+++ b/llvm/test/CodeGen/Hexagon/hwloop-dist-check.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass=hwloops %s -o - | FileCheck %s
+# RUN: llc --mtriple=hexagon -run-pass=hwloops %s -o - | FileCheck %s
 
 # CHECK-LABEL: name: f
 # CHECK: [[R1:%[0-9]+]]:predregs = C2_cmpgti [[R0:%[0-9]+]], 0
@@ -63,11 +63,7 @@
   ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
   declare i32 @llvm.smax.i32(i32, i32) #1
 
-  attributes #0 = { minsize nofree norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
-  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-
   !llvm.module.flags = !{!0, !1, !2, !3}
-
   !0 = !{i32 1, !"wchar_size", i32 4}
   !1 = !{i32 8, !"PIC Level", i32 2}
   !2 = !{i32 7, !"PIE Level", i32 2}



More information about the llvm-commits mailing list