optimize drilldown algorithm

2024-11-08 15:00:34 -06:00 · 2024-11-08 15:00:34 -06:00 · c34e2e960e
parent 81f855f725
commit c34e2e960e
2 changed files with 98 additions and 119 deletions
--- a/main.py
+++ b/main.py
@@ -1119,44 +1119,10 @@ if __name__ == "__main__":
                            "514636990",
                            "514636993",
                            "514636994",
-                            "539794746"
+                            "539794746",
    ]
    special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["532500349",
+    special_doc_id_list = ["479742284",
-        "535324239",
-        "532442891",
-        "543243650",
-        "528588598",
-        "532437639",
-        "527525440",
-        "534987291",
-        "534112055",
-        "533482585",
-        "544208174",
-        "534547266",
-        "544713166",
-        "526463547",
-        "534535569",
-        "534106067",
-        "532486560",
-        "532781760",
-        "533727067",
-        "527256381",
-        "533392425",
-        "532179676",
-        "534300608",
-        "539233950",
-        "532438414",
-        "533681744",
-        "537654645",
-        "533594905",
-        "537926443",
-        "533499655",
-        "533862814",
-        "544918611",
-        "539087870",
-        "536343790",
-        "479742284",
                            "501380497",
                            "501380553",
                            "501380775",
@@ -1186,52 +1152,37 @@ if __name__ == "__main__":
                            "514636993",
                            "514636994",
                            "539794746",
-        "292989214",
+                            "546483469",
-        "316237292",
+                            "546375582",
-        "321733631",
+                            "546375575",
-        "323390570",
+                            "546375576",
-        "327956364",
+                            "546375577",
-        "332223498",
+                            "546375568",
-        "333207452",
+                            "546371033",
-        "334718372",
+                            "546632761",
-        "344636875",
+                            "546632544",
-        "362246081",
+                            "546632464",
-        "366179419",
+                            "546724583",
-        "380945052",
+                            "546724552",
-        "382366116",
+                            "546694677",
-        "387202452",
+                            "546660422",
-        "389171486",
+                            "546638908",
-        "391456740",
+                            "546632845",
-        "391736837",
+                            "546105299",
-        "394778487",
+                            "546085481",
-        "401684600",
+                            "546078693",
-        "402113224",
+                            "546078650",
-        "402181770",
+                            "546289930",
-        "402397014",
+                            "546289910",
-        "405803396",
+                            "542967371",
-        "445102363",
+                            "542798238",
-        "445256897",
+                            "546048730",
-        "448265376",
+                            "546048143",
-        "449555622",
+                            "546047619",
-        "449623976",
+                            "546047528",
-        "458291624",
+                            "546046730",
-        "458359181",
+                            "546919329"
-        "463081566",
+                            ]
-        "469138353",
-        "471641628",
-        "476492237",
-        "478585901",
-        "478586066",
-        "479042264",
-        "479042269",
-        "479793787",
-        "481475385",
-        "483617247",
-        "486378555",
-        "486383912",
-        "492121213",
-        "497497599",
-        "502693599"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = False
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@@ -313,9 +313,35 @@ class PDFUtil:
                matching_val_area = page.search_for(text_block.replace('\n', '').replace('-', ''))
            if len(matching_val_area) == 0:
                matching_val_area = page.search_for(text_block.replace('-\n', ''))
+        if len(matching_val_area) > 0 and len(text_block.strip().split()) == 1:
+            new_matching_val_area = []
+            for area in matching_val_area:
+                # get text by text_bbox
+                pure_text_block = text_block.strip()
+                copy_area = deepcopy(area)
+                copy_area.x0 -= 10
+                copy_area.x1 += 10
+                text = page.get_text("text", clip=copy_area).strip()
+                if text == pure_text_block:
+                    new_matching_val_area.append(area)
+                else:
+                    # get start and end index of the pure_text_block in text
+                    start_index = text.find(pure_text_block)
+                    if start_index > 0:
+                        previous_char = text[start_index - 1]
+                        if previous_char not in [" ", "("]:
+                            continue
+                    end_index = start_index + len(pure_text_block)
+                    if end_index < len(text):
+                        next_char = text[end_index]
+                        if next_char not in [" ", "%", ")"]:
+                            continue
+                    new_matching_val_area.append(area)
+            matching_val_area = new_matching_val_area
        if (
            highlight_text_inside_block is not None
            and len(highlight_text_inside_block) > 0
+            and len(matching_val_area) > 0
        ):
            highlight_bbox_list = []
            merged_matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
@@ -332,14 +358,14 @@ class PDFUtil:
                    else:
                        pure_number_match = pure_number_regex.match(highlight_text_inside_block)
                        if pure_number_match is not None and pure_number_match.group() == highlight_text_inside_block:
-                            for text_bbox in text_bbox_area:
+                            for area in text_bbox_area:
                                # get text by text_bbox
-                                copy_text_bbox = deepcopy(text_bbox)
+                                copy_area = deepcopy(area)
-                                copy_text_bbox.x0 -= 10
+                                copy_area.x0 -= 10
-                                copy_text_bbox.x1 += 10
+                                copy_area.x1 += 10
-                                text = page.get_text("text", clip=copy_text_bbox).strip()
+                                text = page.get_text("text", clip=copy_area).strip()
                                if text == highlight_text_inside_block:
-                                    highlight_bbox_list.append(text_bbox)
+                                    highlight_bbox_list.append(area)
                                else:
                                    # get start and end index of the highlight_text_inside_block in text
                                    start_index = text.find(highlight_text_inside_block)
@@ -352,7 +378,7 @@ class PDFUtil:
                                        next_char = text[end_index]
                                        if next_char not in [" ", "%", ")"]:
                                            continue
-                                    highlight_bbox_list.append(text_bbox)
+                                    highlight_bbox_list.append(area)
                        else:
                            highlight_bbox_list.extend(text_bbox_area)
            if len(highlight_bbox_list) == 0 and len(highlight_text_inside_block.strip().split()) > 2:
@@ -365,6 +391,8 @@ class PDFUtil:
                matching_val_area = [matching_val_area[0]]
        if matching_val_area is not None and len(matching_val_area) > 0:
            if (highlight_text_inside_block is not None and len(highlight_text_inside_block.strip().split()) > 1) or \
                (highlight_text_inside_block is None and len(text_block.strip().split()) > 1):
                matching_val_area = self.merge_matching_val_area(matching_val_area, merge_nearby_lines)
            if exact_match:
                matching_val_area = self.get_exact_match_area(page, matching_val_area, text_block)