Resolve the issue where the first records on a page have only a share class name but no fund name (the fund name is in the previous page's text).
This commit is contained in:
parent
f166e73362
commit
3f2bb38208
|
|
@ -130,12 +130,25 @@ class DataExtraction:
|
|||
data_list = []
|
||||
pdf_page_count = len(self.page_text_dict.keys())
|
||||
handled_page_num_list = []
|
||||
|
||||
previous_page_num = -1
|
||||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
page_datapoints = self.get_datapoints_by_page_num(page_num)
|
||||
if len(page_datapoints) == 0:
|
||||
continue
|
||||
if previous_page_num == page_num - 1 and \
|
||||
previous_page_datapoints == page_datapoints and \
|
||||
previous_page_fund_name is not None:
|
||||
# Transfer the previous page's fund name to be the prefix of the page text
|
||||
# The purpose is to get the fund name when the first records on the page have no fund name
|
||||
# example document: 431073795, page index 1727 to 1728
|
||||
logger.info(f"Transfer previous page fund name: {previous_page_fund_name} to be the pre-fix of page text")
|
||||
page_text = f"\n{previous_page_fund_name}\n{page_text}"
|
||||
|
||||
extract_data = self.extract_data_by_page(
|
||||
page_num,
|
||||
page_text,
|
||||
|
|
@ -144,8 +157,13 @@ class DataExtraction:
|
|||
exclude_data=None,
|
||||
)
|
||||
data_list.append(extract_data)
|
||||
|
||||
|
||||
|
||||
page_data_list = extract_data.get("extract_data", {}).get("data", [])
|
||||
if len(page_data_list) > 0:
|
||||
previous_page_num = page_num
|
||||
previous_page_fund_name = page_data_list[-1].get("fund_name", "")
|
||||
previous_page_datapoints = page_datapoints
|
||||
|
||||
current_page_data_count = len(page_data_list)
|
||||
if current_page_data_count > 0:
|
||||
|
|
|
|||
40
main.py
40
main.py
|
|
@ -786,33 +786,33 @@ if __name__ == "__main__":
|
|||
# ]
|
||||
# Documents in EMEA Case 1.docx
|
||||
check_db_mapping_doc_id_list = [
|
||||
"435128656",
|
||||
"425480144",
|
||||
"466528487",
|
||||
"434902020",
|
||||
"440029306",
|
||||
"431073795",
|
||||
"430240853",
|
||||
"427637151",
|
||||
"434924914",
|
||||
"467595142",
|
||||
"466859621",
|
||||
"429564034",
|
||||
"424976833",
|
||||
"466860852",
|
||||
"466371135",
|
||||
"470515549",
|
||||
"434851173",
|
||||
"434710819",
|
||||
"425480144",
|
||||
"427637151",
|
||||
"429564034",
|
||||
"429950833",
|
||||
"467788879"
|
||||
"430240853",
|
||||
"431073795",
|
||||
"434710819",
|
||||
"434851173",
|
||||
"434902020",
|
||||
"434924914",
|
||||
"435128656",
|
||||
"440029306",
|
||||
"466371135",
|
||||
"466528487",
|
||||
"466859621",
|
||||
"466860852",
|
||||
"467595142",
|
||||
"467788879",
|
||||
"470515549"
|
||||
]
|
||||
# special_doc_id_list = check_mapping_doc_id_list
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
special_doc_id_list = ["514213638"]
|
||||
special_doc_id_list = ["431073795"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
|
|
|||
Loading…
Reference in New Issue