Resolve the issue where the first records on a page have only a share class name and no fund name (the fund name appears in the previous page's text).
This commit is contained in:
parent
f166e73362
commit
3f2bb38208
|
|
@ -130,12 +130,25 @@ class DataExtraction:
|
||||||
data_list = []
|
data_list = []
|
||||||
pdf_page_count = len(self.page_text_dict.keys())
|
pdf_page_count = len(self.page_text_dict.keys())
|
||||||
handled_page_num_list = []
|
handled_page_num_list = []
|
||||||
|
|
||||||
|
previous_page_num = -1
|
||||||
|
previous_page_datapoints = []
|
||||||
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
page_datapoints = self.get_datapoints_by_page_num(page_num)
|
page_datapoints = self.get_datapoints_by_page_num(page_num)
|
||||||
if len(page_datapoints) == 0:
|
if len(page_datapoints) == 0:
|
||||||
continue
|
continue
|
||||||
|
if previous_page_num == page_num - 1 and \
|
||||||
|
previous_page_datapoints == page_datapoints and \
|
||||||
|
previous_page_fund_name is not None:
|
||||||
|
# Prepend the previous page's fund name to the current page text
|
||||||
|
# The purpose is to recover the fund name when the first records on this page have no fund name
|
||||||
|
# example document: 431073795, page index 1727 to 1728
|
||||||
|
logger.info(f"Transfer previous page fund name: {previous_page_fund_name} to be the pre-fix of page text")
|
||||||
|
page_text = f"\n{previous_page_fund_name}\n{page_text}"
|
||||||
|
|
||||||
extract_data = self.extract_data_by_page(
|
extract_data = self.extract_data_by_page(
|
||||||
page_num,
|
page_num,
|
||||||
page_text,
|
page_text,
|
||||||
|
|
@ -145,7 +158,12 @@ class DataExtraction:
|
||||||
)
|
)
|
||||||
data_list.append(extract_data)
|
data_list.append(extract_data)
|
||||||
|
|
||||||
|
|
||||||
page_data_list = extract_data.get("extract_data", {}).get("data", [])
|
page_data_list = extract_data.get("extract_data", {}).get("data", [])
|
||||||
|
if len(page_data_list) > 0:
|
||||||
|
previous_page_num = page_num
|
||||||
|
previous_page_fund_name = page_data_list[-1].get("fund_name", "")
|
||||||
|
previous_page_datapoints = page_datapoints
|
||||||
|
|
||||||
current_page_data_count = len(page_data_list)
|
current_page_data_count = len(page_data_list)
|
||||||
if current_page_data_count > 0:
|
if current_page_data_count > 0:
|
||||||
|
|
|
||||||
40
main.py
40
main.py
|
|
@ -786,33 +786,33 @@ if __name__ == "__main__":
|
||||||
# ]
|
# ]
|
||||||
# Documents in EMEA Case 1.docx
|
# Documents in EMEA Case 1.docx
|
||||||
check_db_mapping_doc_id_list = [
|
check_db_mapping_doc_id_list = [
|
||||||
"435128656",
|
|
||||||
"425480144",
|
|
||||||
"466528487",
|
|
||||||
"434902020",
|
|
||||||
"440029306",
|
|
||||||
"431073795",
|
|
||||||
"430240853",
|
|
||||||
"427637151",
|
|
||||||
"434924914",
|
|
||||||
"467595142",
|
|
||||||
"466859621",
|
|
||||||
"429564034",
|
|
||||||
"424976833",
|
"424976833",
|
||||||
"466860852",
|
"425480144",
|
||||||
"466371135",
|
"427637151",
|
||||||
"470515549",
|
"429564034",
|
||||||
"434851173",
|
|
||||||
"434710819",
|
|
||||||
"429950833",
|
"429950833",
|
||||||
"467788879"
|
"430240853",
|
||||||
|
"431073795",
|
||||||
|
"434710819",
|
||||||
|
"434851173",
|
||||||
|
"434902020",
|
||||||
|
"434924914",
|
||||||
|
"435128656",
|
||||||
|
"440029306",
|
||||||
|
"466371135",
|
||||||
|
"466528487",
|
||||||
|
"466859621",
|
||||||
|
"466860852",
|
||||||
|
"467595142",
|
||||||
|
"467788879",
|
||||||
|
"470515549"
|
||||||
]
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["514213638"]
|
special_doc_id_list = ["431073795"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue