in processing/processing/text_merge.py [0:0]
def match_page(words: Dict[str, Any], page: Page) -> MatchedPage:
    """Match word bboxes to annotation (paragraph) bboxes.

    Every entry of ``words["objs"]`` whose ``"type"`` is ``"text"`` is tested
    against the paragraphs in ``page.objs`` in order.  The word is attached to
    the first paragraph for which ``BorderBox.box_is_inside_box`` succeeds
    with ``threshold=0.8``; later paragraphs are not considered for that word.
    Words that match no paragraph are dropped.

    Args:
        words: Mapping with an ``"objs"`` list; each item is read for its
            ``"type"``, ``"bbox"`` (first four values) and ``"text"`` keys.
        page: Annotated page providing ``page_num`` and ``objs``; each object
            is read for its ``"id"`` and ``"bbox"`` keys.

    Returns:
        A ``MatchedPage`` whose ``paragraph_bboxes`` maps a paragraph id to a
        ``ParagraphBbox`` holding the matched words as dicts with the keys
        ``x1``, ``y1``, ``x2``, ``y2`` and ``text``.  Paragraphs without any
        matched word have no entry.
    """
    matched_page = MatchedPage(page_num=page.page_num, paragraph_bboxes={})

    # A paragraph's box does not depend on the word being tested, so build it
    # once per paragraph instead of once per (word, paragraph) pair.
    # Coordinates are truncated to int, exactly as before.
    # NOTE(review): assumes box_is_inside_box does not mutate its argument,
    # which is required for sharing the instance across words -- confirm.
    paragraph_boxes = [
        (paragraph, BorderBox(*(int(i) for i in paragraph["bbox"])))
        for paragraph in page.objs
    ]

    for raw_word_bbox in words["objs"]:
        if raw_word_bbox["type"] != "text":
            continue

        coords = raw_word_bbox["bbox"]
        word_bbox = BorderBox(coords[0], coords[1], coords[2], coords[3])

        for paragraph, paragraph_bbox in paragraph_boxes:
            # NOTE(review): 0.8 is presumably the minimum share of the word
            # box that must lie inside the paragraph -- verify in BorderBox.
            if not word_bbox.box_is_inside_box(paragraph_bbox, threshold=0.8):
                continue

            nest_bbox = {
                "x1": word_bbox.top_left_x,
                "y1": word_bbox.top_left_y,
                "x2": word_bbox.bottom_right_x,
                "y2": word_bbox.bottom_right_y,
                "text": raw_word_bbox["text"],
            }

            # Create the paragraph entry only on its first matched word;
            # setdefault would construct a discarded ParagraphBbox every time.
            paragraph_id = paragraph["id"]
            if paragraph_id not in matched_page.paragraph_bboxes:
                matched_page.paragraph_bboxes[paragraph_id] = ParagraphBbox(
                    bbox=paragraph["bbox"], nested_bboxes=[]
                )
            matched_page.paragraph_bboxes[paragraph_id].nested_bboxes.append(
                nest_bbox
            )
            # First matching paragraph wins; a word is never assigned twice.
            break

    return matched_page