prepare for 100 multi-fund document samples

This commit is contained in:
Blade He 2025-01-17 16:26:31 -06:00
parent b93a8d55e8
commit d41fae3dba
6 changed files with 323 additions and 112 deletions

View File

@@ -47,11 +47,13 @@ def emea_ar_data_extract():
extract_way = "text"
os.makedirs(pdf_folder, exist_ok=True)
os.makedirs(output_pdf_text_folder, exist_ok=True)
os.makedirs(output_extract_data_folder, exist_ok=True)
os.makedirs(output_mapping_data_folder, exist_ok=True)
os.makedirs(drilldown_folder, exist_ok=True)
clean_folder(pdf_folder)
clean_folder(output_pdf_text_folder)
clean_folder(output_extract_data_folder)
clean_folder(output_mapping_data_folder)
clean_folder(drilldown_folder)

View File

@@ -1,11 +1,2 @@
{
"539790009": [39, 40, 45, 46, 47],
"542300403": [12],
"542306317": [4, 15, 16, 17, 18],
"547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
"552505237": [16, 17, 18, 19, 25, 26, 27],
"552505278": [12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29],
"554431052": [34, 35, 36, 41, 42, 43],
"555377021": [21, 22, 23, 24, 25, 26],
"555654388": [35, 36]
}

View File

@@ -12,10 +12,16 @@ from utils.pdf_util import get_pdf_pages_by_html
class FilterPages:
def __init__(
self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, doc_source: str = "emea_ar"
self,
doc_id: str,
pdf_file: str,
document_mapping_info_df: pd.DataFrame,
doc_source: str = "emea_ar",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
) -> None:
self.doc_id = doc_id
self.pdf_file = pdf_file
self.output_pdf_text_folder = output_pdf_text_folder
self.configuration_folder = f"./configuration/{doc_source}/"
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
if os.path.exists(misc_config_file):
@@ -45,15 +51,64 @@ class FilterPages:
def get_pdf_page_text_dict(self) -> dict:
page_text_dict = {}
# if exist page text file, load it
self.page_text_file = ""
pdf2html_page_text_file = ""
pymupdf_page_text_file = ""
if self.output_pdf_text_folder is not None and len(self.output_pdf_text_folder) > 0:
os.makedirs(self.output_pdf_text_folder, exist_ok=True)
pdf2html_output_pdf_text_folder = os.path.join(
self.output_pdf_text_folder, "pdf2html/"
)
os.makedirs(pdf2html_output_pdf_text_folder, exist_ok=True)
pymupdf_output_pdf_text_folder = os.path.join(
self.output_pdf_text_folder, "pymupdf/"
)
os.makedirs(pymupdf_output_pdf_text_folder, exist_ok=True)
pdf2html_page_text_file = os.path.join(
pdf2html_output_pdf_text_folder, f"{self.doc_id}_page_text.json"
)
pymupdf_page_text_file = os.path.join(
pymupdf_output_pdf_text_folder, f"{self.doc_id}_page_text.json"
)
if os.path.exists(pdf2html_page_text_file):
self.apply_pdf2html = True
self.page_text_file = pdf2html_page_text_file
with open(pdf2html_page_text_file, "r", encoding="utf-8") as f:
page_text_dict = json.load(f)
elif os.path.exists(pymupdf_page_text_file):
self.apply_pdf2html = False
self.page_text_file = pymupdf_page_text_file
with open(pymupdf_page_text_file, "r", encoding="utf-8") as f:
page_text_dict = json.load(f)
else:
pass
if len(page_text_dict.keys()) > 0:
logger.info(f"Load page text from file: {self.page_text_file}")
# transfer the keys of page_text_dict to be int
page_text_dict = {int(k): v for k, v in page_text_dict.items()}
return page_text_dict
if self.apply_pdf2html:
page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
if len(page_text_dict.keys()) == 0:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
self.apply_pdf2html = False
self.page_text_file = pymupdf_page_text_file
else:
self.page_text_file = pdf2html_page_text_file
else:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
self.page_text_file = pymupdf_page_text_file
if len(self.page_text_file) > 0:
try:
with open(self.page_text_file, "w", encoding="utf-8") as f:
json.dump(page_text_dict, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return page_text_dict
def get_configuration_from_file(self) -> dict:

143
main.py
View File

@@ -71,27 +71,13 @@ class EMEA_AR_Parsing:
os.makedirs(self.output_mapping_data_folder, exist_ok=True)
self.filter_pages = FilterPages(
self.doc_id, self.pdf_file, self.document_mapping_info_df, self.doc_source
self.doc_id,
self.pdf_file,
self.document_mapping_info_df,
self.doc_source,
output_pdf_text_folder
)
self.page_text_dict = self.filter_pages.page_text_dict
try:
os.makedirs(output_pdf_text_folder, exist_ok=True)
if self.filter_pages.apply_pdf2html:
output_pdf_text_folder = os.path.join(
output_pdf_text_folder, "pdf2html/"
)
else:
output_pdf_text_folder = os.path.join(
output_pdf_text_folder, "pymupdf/"
)
os.makedirs(output_pdf_text_folder, exist_ok=True)
self.page_text_file = os.path.join(
output_pdf_text_folder, f"{self.doc_id}_page_text.json"
)
with open(self.page_text_file, "w", encoding="utf-8") as f:
json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
self.datapoints = self.get_datapoints_from_datapoint_page_info()
@@ -1079,6 +1065,7 @@ def batch_initial_document(
document_list_file: str = "sample_document_complex.txt",
doc_source: str = "emea_ar",
pdf_folder: str = r"/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
):
@@ -1094,6 +1081,7 @@ def batch_initial_document(
doc_id=doc_id,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_folder=output_extract_data_child_folder,
output_mapping_data_folder=output_mapping_child_folder,
)
@@ -1319,74 +1307,81 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
os.makedirs(merged_total_data_folder, exist_ok=True)
data_file_base_name = os.path.basename(data_file_path)
output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
# os.makedirs(merged_total_data_folder, exist_ok=True)
# data_file_base_name = os.path.basename(data_file_path)
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
# doc_source = "aus_prospectus"
# sample_document_list_folder: str = r'./sample_documents/'
# document_list_file: str = "aus_prospectus.txt"
# document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
# output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
# batch_initial_document(sample_document_list_folder=sample_document_list_folder,
# document_list_file=document_list_file,
# doc_source=doc_source,
# pdf_folder=pdf_folder,
# output_pdf_text_folder=output_pdf_text_folder,
# output_extract_data_child_folder=output_extract_data_child_folder,
# output_mapping_child_folder=output_mapping_child_folder)
# special_doc_id_list = ["553242411"]
# doc_source = "aus_prospectus"
# if doc_source == "aus_prospectus":
# special_doc_id_list: list = [
# "539790009",
# "542300403",
# "542301117",
# "542306317",
# "547567013",
# "552505237",
# "552505278",
# "554431052",
# "554851189",
# "555377021",
# "555654388",
# ]
# # special_doc_id_list: list = ["554851189"]
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
# output_extract_data_child_folder: str = (
# r"/data/aus_prospectus/output/extract_data/docs/"
# )
# output_extract_data_total_folder: str = (
# r"/data/aus_prospectus/output/extract_data/total/"
# )
# output_mapping_child_folder: str = (
# r"/data/aus_prospectus/output/mapping_data/docs/"
# )
# output_mapping_total_folder: str = (
# r"/data/aus_prospectus/output/mapping_data/total/"
# )
# drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
# batch_run_documents(
# doc_source=doc_source,
# special_doc_id_list=special_doc_id_list,
# pdf_folder=pdf_folder,
# output_pdf_text_folder=output_pdf_text_folder,
# output_extract_data_child_folder=output_extract_data_child_folder,
# output_extract_data_total_folder=output_extract_data_total_folder,
# output_mapping_child_folder=output_mapping_child_folder,
# output_mapping_total_folder=output_mapping_total_folder,
# drilldown_folder=drilldown_folder,
# )
# elif doc_source == "emea_ar":
# special_doc_id_list = ["553242408"]
# batch_run_documents(
# doc_source=doc_source, special_doc_id_list=special_doc_id_list
# )
doc_source = "aus_prospectus"
if doc_source == "aus_prospectus":
document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
# special_doc_id_list: list = [
# "539790009",
# "542300403",
# "542301117",
# "542306317",
# "547567013",
# "552505237",
# "552505278",
# "554431052",
# "554851189",
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["554851189"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (
r"/data/aus_prospectus/output/extract_data/docs/"
)
output_extract_data_total_folder: str = (
r"/data/aus_prospectus/output/extract_data/total/"
)
output_mapping_child_folder: str = (
r"/data/aus_prospectus/output/mapping_data/docs/"
)
output_mapping_total_folder: str = (
r"/data/aus_prospectus/output/mapping_data/total/"
)
drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
batch_run_documents(
doc_source=doc_source,
special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder,
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_child_folder=output_extract_data_child_folder,
output_extract_data_total_folder=output_extract_data_total_folder,
output_mapping_child_folder=output_mapping_child_folder,
output_mapping_total_folder=output_mapping_total_folder,
drilldown_folder=drilldown_folder,
)
elif doc_source == "emea_ar":
special_doc_id_list = ["553242408"]
batch_run_documents(
doc_source=doc_source, special_doc_id_list=special_doc_id_list
)
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"

View File

@@ -12,7 +12,6 @@ import json_repair
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction
def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
@@ -78,23 +77,6 @@ def output_pdf_page_text(pdf_folder: str, output_folder: str):
logger.info(f"Successfully extracted text from {pdf_file}")
def extract_pdf_table(pdf_folder: str, output_folder: str):
if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
logger.error(f"Invalid pdf_folder: {pdf_folder}")
return
if output_folder is None or len(output_folder) == 0:
logger.error(f"Invalid output_folder: {output_folder}")
return
os.makedirs(output_folder, exist_ok=True)
pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
for pdf_file in pdf_files:
logger.info(f"Start processing {pdf_file}")
pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder)
pdf_table_extraction.extract_tables()
def analyze_json_error():
text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
with open(text_file, "r", encoding="utf-8") as file:
@@ -1385,15 +1367,100 @@ def merge_aus_document_prospectus_data():
aus_document_prospectus_data.to_excel(
writer, sheet_name="aus_document_prospectus", index=False
)
def pdf_exist():
data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
data_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
percentile_result_df = pd.read_excel(data_file, sheet_name="percentile_result")
document_id_list = percentile_result_df["DocumentId"].unique().tolist()
pdf_doc_path = r"/data/aus_prospectus/pdf/"
for doc_id in document_id_list:
pdf_file_path = os.path.join(pdf_doc_path, f"{doc_id}.pdf")
if not os.path.exists(pdf_file_path):
logger.error(f"pdf file not exist: {pdf_file_path}")
else:
logger.info(f"pdf file exist: {pdf_file_path}")
def get_pdf_2_html():
pass
def prepare_multi_fund_aus_prospectus_document():
data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
document_data_df.fillna("", inplace=True)
# get data from document_data_df which SecurityName is not empty string
document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
document_id_list = document_data_df["DocumentId"].unique().tolist()
# get document which fund count > 1
document_fund_count_df = document_fund_count_df[document_fund_count_df["DocumentId"].isin(document_id_list)]
document_fund_count_df = document_fund_count_df[document_fund_count_df["DistinctFundCount"] > 1]
document_fund_count_df = document_fund_count_df.sort_values(by="DistinctFundCount", ascending=False)
# Calculate percentile
percentiles = [0, 0.3, 0.6, 1]
quantile_values = document_fund_count_df['DistinctFundCount'].quantile(percentiles)
# Group by percentile
bins = [quantile_values[0], quantile_values[0.3], quantile_values[0.6], quantile_values[1]]
document_fund_count_df['Percentile_Group'] = pd.cut(document_fund_count_df['DistinctFundCount'], bins=bins, labels=["0-30", "30-60", "60-100"], include_lowest=True)
# Get relevant samples based on percentile group
percentile_result = pd.DataFrame()
for group, count in zip(["0-30", "30-60", "60-100"], [30, 30, 40]):
group_df = document_fund_count_df[document_fund_count_df['Percentile_Group'] == group]
sampled_df = group_df.sample(n=min(len(group_df), count), random_state=42)
percentile_result = pd.concat([percentile_result, sampled_df], ignore_index=True)
percentile_result.reset_index(drop=True, inplace=True)
document_id_list = percentile_result["DocumentId"].unique().tolist()
final_document_mapping_df = document_mapping_df[document_mapping_df["DocumentId"].isin(document_id_list)]
# order by DocumentId, FundLegalName, FundClassLegalName
final_document_mapping_df = final_document_mapping_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
final_document_mapping_df.reset_index(drop=True, inplace=True)
# get CompanyId, CompanyName from final_document_mapping_df
final_document_provider_df = final_document_mapping_df[["CompanyId", "CompanyName"]].drop_duplicates()
# order by CompanyName
final_document_provider_df = final_document_provider_df.sort_values(by="CompanyName", ascending=True)
final_document_provider_df.reset_index(drop=True, inplace=True)
final_document_data_df = document_data_df[document_data_df["DocumentId"].isin(document_id_list)]
# order by DocumentId, FundLegalName, FundClassLegalName
final_document_data_df = final_document_data_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
final_document_data_df.reset_index(drop=True, inplace=True)
output_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
with pd.ExcelWriter(output_file) as writer:
final_document_mapping_df.to_excel(
writer, sheet_name="document_mapping", index=False
)
final_document_provider_df.to_excel(
writer, sheet_name="document_provider", index=False
)
final_document_data_df.to_excel(
writer, sheet_name="aus_document_data", index=False
)
percentile_result.to_excel(
writer, sheet_name="percentile_result", index=False
)
output_sample_document_file = os.path.join(r"./sample_documents/",
"aus_prospectus_100_documents_multi_fund_sample.txt")
# output document id to txt file
with open(output_sample_document_file, "w") as f:
for doc_id in document_id_list:
f.write(f"{doc_id}\n")
if __name__ == "__main__":
merge_aus_document_prospectus_data()
# pdf_exist()
prepare_multi_fund_aus_prospectus_document()
# merge_aus_document_prospectus_data()
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
file_name = "doc_ar_data_for_emea_11_06.xlsx"
# get_document_with_all_4_data_points(folder, file_name, None)
@@ -1447,18 +1514,19 @@ if __name__ == "__main__":
output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx"
pdf_folder = r"/data/illume/japan_prospectus/pdf/"
doc_ar_data_file_path = None
doc_mapping_data_file_path = r"/data/illume/japan_prospectus/materials/document_mapping.xlsx"
output_data_folder = r"/data/illume/japan_prospectus/materials/"
output_file = "japan_prospectus_statistics.xlsx"
# pdf_folder = r"/data/aus_prospectus/pdf/"
# doc_ar_data_file_path = None
# doc_mapping_data_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# output_data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
# output_file = "aus_100_document_prospectus_multi_fund_statistics.xlsx"
# statistics_document(pdf_folder=pdf_folder,
# doc_mapping_file_path=doc_mapping_data_file_path,
# doc_ar_data_file_path=doc_ar_data_file_path,
# mapping_sheet_name="Sheet1",
# ar_data_sheet_name="doc_ar_data_in_db",
# mapping_sheet_name="document_mapping",
# ar_data_sheet_name="aus_document_data",
# output_folder=output_data_folder,
# output_file=output_file)
# get_document_extracted_share_diff_by_db()
# statistics_provider_mapping(
# provider_mapping_data_file=provider_mapping_data_file,

View File

@@ -0,0 +1,100 @@
539559156
544218181
541357691
539601979
518496603
498429337
515319827
500162212
557034899
520954335
555654388
521324579
539791416
520663234
534711945
551941891
509391262
521618542
557362556
556248266
497151360
521625399
539559165
539601349
557362553
524631914
530101994
533499706
557526104
521618713
536014247
534287530
553449194
526644754
550769189
526200526
539604078
535821734
511351053
515041837
550026958
554124220
539794750
539580710
526200513
557065365
551941902
535827969
535825845
557526108
544886026
539794737
540028470
552995727
517738771
521630976
539601982
529881356
546269437
521822717
515499515
521617820
539791633
524248225
492008222
522608195
515978340
560485097
546289968
558726012
547567013
492202154
520698753
540453691
539791362
535825766
534287518
552727485
521617806
539266817
557362542
522879836
535821752
528208771
518492363
552505270
502017146
545500792
539604126
503209562
521822710
550533961
513111748
545750078
539581666
521609521
542630657
491033056
560260579
521618962