Skip to content

Commit

Permalink
fix(ocr): another fix for #274
Browse files: browse the repository at this point in the history
  • Loading branch information
eroux committed Jun 5, 2024
1 parent e34e662 commit 2a7b401
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions openpecha/formatters/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,12 @@ def sort_bboxes(self, main_region_bboxes):
centroid = bbox.get_centriod()
bboxes[f"{centroid[0]},{centroid[1]}"] = bbox
bbox_centriods.append(centroid)
if len(bbox_centriods) == 0:
return []
sorted_bboxes = []
sort_on_y_bboxs = self.get_bbox_sorted_on_y(bbox_centriods)
if len(sort_on_y_bboxs) == 0:
return []
sorted_bbox_centriods = self.get_bbox_sorted_on_x(sort_on_y_bboxs, avg_box_height, bboxes)
for bbox_centriod in sorted_bbox_centriods:
sorted_bboxes.append(bboxes[f"{bbox_centriod[0]},{bbox_centriod[1]}"])
Expand Down Expand Up @@ -484,13 +488,15 @@ def has_abnormal_postprocessing(self, original_bboxes, postprocessed_bboxes):
return False

def build_page(self, bboxes, image_number, image_filename, state, avg_char_width=None):
if len(bboxes) == 0:
return
flatten_bboxes = []
for line_bboxes in bboxes:
for bbox in line_bboxes:
flatten_bboxes.append(bbox)
flatten_bboxes.append(bbox)
if len(flatten_bboxes) == 0:
return
sorted_bboxes = self.sort_bboxes(flatten_bboxes)
if len(sorted_bboxes) == 0:
return
bbox_lines = self.get_bbox_lines(sorted_bboxes)
if self.check_postprocessing and self.has_abnormal_postprocessing(bboxes, bbox_lines):
bbox_lines = bboxes
Expand Down Expand Up @@ -596,7 +602,10 @@ def build_base(self, image_group_id):
# enumerate starts at 0 but image numbers start at 1
bboxes, avg_char_width = self.get_bboxes_for_page(image_group_id, image_filename)
if bboxes:
self.build_page(bboxes, image_number+1, image_filename, state, avg_char_width)
try:
self.build_page(bboxes, image_number+1, image_filename, state, avg_char_width)
except:
logger.error("error while building page")
layers = {}
if state["pagination_annotations"]:
layer = Layer(annotation_type=LayerEnum.pagination, annotations=state["pagination_annotations"])
Expand Down

0 comments on commit 2a7b401

Please sign in to comment.