prepare for 100 multi-fund document samples
parent b93a8d55e8
commit d41fae3dba
@@ -47,11 +47,13 @@ def emea_ar_data_extract():
     extract_way = "text"

     os.makedirs(pdf_folder, exist_ok=True)
+    os.makedirs(output_pdf_text_folder, exist_ok=True)
     os.makedirs(output_extract_data_folder, exist_ok=True)
     os.makedirs(output_mapping_data_folder, exist_ok=True)
     os.makedirs(drilldown_folder, exist_ok=True)

     clean_folder(pdf_folder)
+    clean_folder(output_pdf_text_folder)
     clean_folder(output_extract_data_folder)
     clean_folder(output_mapping_data_folder)
     clean_folder(drilldown_folder)
@@ -1,11 +1,2 @@
 {
-    "539790009": [39, 40, 45, 46, 47],
-    "542300403": [12],
-    "542306317": [4, 15, 16, 17, 18],
-    "547567013": [12, 13, 14, 15, 16, 17, 33, 34, 35, 39, 40, 42, 43, 44, 45],
-    "552505237": [16, 17, 18, 19, 25, 26, 27],
-    "552505278": [12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29],
-    "554431052": [34, 35, 36, 41, 42, 43],
-    "555377021": [21, 22, 23, 24, 25, 26],
-    "555654388": [35, 36]
 }
@@ -12,10 +12,16 @@ from utils.pdf_util import get_pdf_pages_by_html

 class FilterPages:
     def __init__(
-        self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, doc_source: str = "emea_ar"
+        self,
+        doc_id: str,
+        pdf_file: str,
+        document_mapping_info_df: pd.DataFrame,
+        doc_source: str = "emea_ar",
+        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     ) -> None:
         self.doc_id = doc_id
         self.pdf_file = pdf_file
+        self.output_pdf_text_folder = output_pdf_text_folder
         self.configuration_folder = f"./configuration/{doc_source}/"
         misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
         if os.path.exists(misc_config_file):
@@ -45,15 +51,64 @@ class FilterPages:

     def get_pdf_page_text_dict(self) -> dict:
         page_text_dict = {}
+        # if exist page text file, load it
+        self.page_text_file = ""
+        pdf2html_page_text_file = ""
+        pymupdf_page_text_file = ""
+        if self.output_pdf_text_folder is not None and len(self.output_pdf_text_folder) > 0:
+            os.makedirs(self.output_pdf_text_folder, exist_ok=True)
+            pdf2html_output_pdf_text_folder = os.path.join(
+                self.output_pdf_text_folder, "pdf2html/"
+            )
+            os.makedirs(pdf2html_output_pdf_text_folder, exist_ok=True)
+            pymupdf_output_pdf_text_folder = os.path.join(
+                self.output_pdf_text_folder, "pymupdf/"
+            )
+            os.makedirs(pymupdf_output_pdf_text_folder, exist_ok=True)
+            pdf2html_page_text_file = os.path.join(
+                pdf2html_output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            pymupdf_page_text_file = os.path.join(
+                pymupdf_output_pdf_text_folder, f"{self.doc_id}_page_text.json"
+            )
+            if os.path.exists(pdf2html_page_text_file):
+                self.apply_pdf2html = True
+                self.page_text_file = pdf2html_page_text_file
+                with open(pdf2html_page_text_file, "r", encoding="utf-8") as f:
+                    page_text_dict = json.load(f)
+            elif os.path.exists(pymupdf_page_text_file):
+                self.apply_pdf2html = False
+                self.page_text_file = pymupdf_page_text_file
+                with open(pymupdf_page_text_file, "r", encoding="utf-8") as f:
+                    page_text_dict = json.load(f)
+            else:
+                pass
+        if len(page_text_dict.keys()) > 0:
+            logger.info(f"Load page text from file: {self.page_text_file}")
+            # transfer the keys of page_text_dict to be int
+            page_text_dict = {int(k): v for k, v in page_text_dict.items()}
+            return page_text_dict
+
         if self.apply_pdf2html:
             page_text_dict = get_pdf_pages_by_html(self.pdf_file, pdf_info_type="pdf_path")
             if len(page_text_dict.keys()) == 0:
                 pdf_util = PDFUtil(self.pdf_file)
                 success, text, page_text_dict = pdf_util.extract_text()
                 self.apply_pdf2html = False
+                self.page_text_file = pymupdf_page_text_file
+            else:
+                self.page_text_file = pdf2html_page_text_file
         else:
             pdf_util = PDFUtil(self.pdf_file)
             success, text, page_text_dict = pdf_util.extract_text()
+            self.page_text_file = pymupdf_page_text_file
+
+        if len(self.page_text_file) > 0:
+            try:
+                with open(self.page_text_file, "w", encoding="utf-8") as f:
+                    json.dump(page_text_dict, f, ensure_ascii=False, indent=4)
+            except Exception as e:
+                logger.error(f"Error: {e}")
         return page_text_dict

     def get_configuration_from_file(self) -> dict:
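
The hunk above makes FilterPages cache page text per extraction backend: it loads {doc_id}_page_text.json from the pdf2html/ or pymupdf/ subfolder when one already exists, and otherwise extracts the text and writes the JSON back for the next run. A minimal standalone sketch of that load-or-extract-and-cache pattern follows; the helper name load_or_extract_page_text and the extract_fn callable are illustrative placeholders, not names from the repo.

# Sketch only: names are placeholders; the caching behaviour mirrors the hunk above.
import json
import os


def load_or_extract_page_text(doc_id, pdf_file, cache_folder, extract_fn):
    """Return {page_number: text}, reusing a cached JSON when one exists."""
    cache_file = os.path.join(cache_folder, f"{doc_id}_page_text.json")
    if os.path.exists(cache_file):
        with open(cache_file, "r", encoding="utf-8") as f:
            # JSON keys are strings; convert them back to int page numbers.
            return {int(k): v for k, v in json.load(f).items()}

    page_text_dict = extract_fn(pdf_file)  # any callable returning {page: text}
    os.makedirs(cache_folder, exist_ok=True)
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(page_text_dict, f, ensure_ascii=False, indent=4)
    return page_text_dict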

main.py (117 changed lines)
@@ -71,27 +71,13 @@ class EMEA_AR_Parsing:
         os.makedirs(self.output_mapping_data_folder, exist_ok=True)

         self.filter_pages = FilterPages(
-            self.doc_id, self.pdf_file, self.document_mapping_info_df, self.doc_source
+            self.doc_id,
+            self.pdf_file,
+            self.document_mapping_info_df,
+            self.doc_source,
+            output_pdf_text_folder
         )
         self.page_text_dict = self.filter_pages.page_text_dict
-        try:
-            os.makedirs(output_pdf_text_folder, exist_ok=True)
-            if self.filter_pages.apply_pdf2html:
-                output_pdf_text_folder = os.path.join(
-                    output_pdf_text_folder, "pdf2html/"
-                )
-            else:
-                output_pdf_text_folder = os.path.join(
-                    output_pdf_text_folder, "pymupdf/"
-                )
-            os.makedirs(output_pdf_text_folder, exist_ok=True)
-            self.page_text_file = os.path.join(
-                output_pdf_text_folder, f"{self.doc_id}_page_text.json"
-            )
-            with open(self.page_text_file, "w", encoding="utf-8") as f:
-                json.dump(self.page_text_dict, f, ensure_ascii=False, indent=4)
-        except Exception as e:
-            logger.error(f"Error: {e}")

         self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
         self.datapoints = self.get_datapoints_from_datapoint_page_info()
@@ -1079,6 +1065,7 @@ def batch_initial_document(
     document_list_file: str = "sample_document_complex.txt",
     doc_source: str = "emea_ar",
     pdf_folder: str = r"/data/emea_ar/pdf/",
+    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
     output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
     output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
 ):
@@ -1094,6 +1081,7 @@ def batch_initial_document(
             doc_id=doc_id,
             doc_source=doc_source,
             pdf_folder=pdf_folder,
+            output_pdf_text_folder=output_pdf_text_folder,
             output_extract_data_folder=output_extract_data_child_folder,
             output_mapping_data_folder=output_mapping_child_folder,
         )
@@ -1319,29 +1307,36 @@ def merge_output_data_aus_prospectus(


 if __name__ == "__main__":
-    data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
-    document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
-    merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
-    os.makedirs(merged_total_data_folder, exist_ok=True)
-    data_file_base_name = os.path.basename(data_file_path)
-    output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
-    merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
+    # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
+    # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
+    # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
+    # os.makedirs(merged_total_data_folder, exist_ok=True)
+    # data_file_base_name = os.path.basename(data_file_path)
+    # output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
+    # merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)

+    # doc_source = "aus_prospectus"
     # sample_document_list_folder: str = r'./sample_documents/'
-    # document_list_file: str = "aus_prospectus.txt"
+    # document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
     # pdf_folder: str = r"/data/aus_prospectus/pdf/"
+    # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
     # output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
     # output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
     # batch_initial_document(sample_document_list_folder=sample_document_list_folder,
     #                        document_list_file=document_list_file,
+    #                        doc_source=doc_source,
     #                        pdf_folder=pdf_folder,
+    #                        output_pdf_text_folder=output_pdf_text_folder,
     #                        output_extract_data_child_folder=output_extract_data_child_folder,
     #                        output_mapping_child_folder=output_mapping_child_folder)

     # special_doc_id_list = ["553242411"]

-    # doc_source = "aus_prospectus"
-    # if doc_source == "aus_prospectus":
+    doc_source = "aus_prospectus"
+    if doc_source == "aus_prospectus":
+        document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        with open(document_sample_file, "r", encoding="utf-8") as f:
+            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
     # special_doc_id_list: list = [
     #     "539790009",
     #     "542300403",
@@ -1355,38 +1350,38 @@ if __name__ == "__main__":
     #     "555377021",
     #     "555654388",
     # ]
-    # # special_doc_id_list: list = ["554851189"]
-    # pdf_folder: str = r"/data/aus_prospectus/pdf/"
-    # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
-    # output_extract_data_child_folder: str = (
-    #     r"/data/aus_prospectus/output/extract_data/docs/"
-    # )
-    # output_extract_data_total_folder: str = (
-    #     r"/data/aus_prospectus/output/extract_data/total/"
-    # )
-    # output_mapping_child_folder: str = (
-    #     r"/data/aus_prospectus/output/mapping_data/docs/"
-    # )
-    # output_mapping_total_folder: str = (
-    #     r"/data/aus_prospectus/output/mapping_data/total/"
-    # )
-    # drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
-    # batch_run_documents(
-    #     doc_source=doc_source,
-    #     special_doc_id_list=special_doc_id_list,
-    #     pdf_folder=pdf_folder,
-    #     output_pdf_text_folder=output_pdf_text_folder,
-    #     output_extract_data_child_folder=output_extract_data_child_folder,
-    #     output_extract_data_total_folder=output_extract_data_total_folder,
-    #     output_mapping_child_folder=output_mapping_child_folder,
-    #     output_mapping_total_folder=output_mapping_total_folder,
-    #     drilldown_folder=drilldown_folder,
-    # )
-    # elif doc_source == "emea_ar":
-    #     special_doc_id_list = ["553242408"]
-    #     batch_run_documents(
-    #         doc_source=doc_source, special_doc_id_list=special_doc_id_list
-    #     )
+    # special_doc_id_list: list = ["554851189"]
+        pdf_folder: str = r"/data/aus_prospectus/pdf/"
+        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
+        output_extract_data_child_folder: str = (
+            r"/data/aus_prospectus/output/extract_data/docs/"
+        )
+        output_extract_data_total_folder: str = (
+            r"/data/aus_prospectus/output/extract_data/total/"
+        )
+        output_mapping_child_folder: str = (
+            r"/data/aus_prospectus/output/mapping_data/docs/"
+        )
+        output_mapping_total_folder: str = (
+            r"/data/aus_prospectus/output/mapping_data/total/"
+        )
+        drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
+        batch_run_documents(
+            doc_source=doc_source,
+            special_doc_id_list=special_doc_id_list,
+            pdf_folder=pdf_folder,
+            output_pdf_text_folder=output_pdf_text_folder,
+            output_extract_data_child_folder=output_extract_data_child_folder,
+            output_extract_data_total_folder=output_extract_data_total_folder,
+            output_mapping_child_folder=output_mapping_child_folder,
+            output_mapping_total_folder=output_mapping_total_folder,
+            drilldown_folder=drilldown_folder,
+        )
+    elif doc_source == "emea_ar":
+        special_doc_id_list = ["553242408"]
+        batch_run_documents(
+            doc_source=doc_source, special_doc_id_list=special_doc_id_list
+        )

     # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
     # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"

prepare_data.py (124 changed lines)
@@ -12,7 +12,6 @@ import json_repair
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
 from utils.pdf_util import PDFUtil
-from pdf_table_extraction import PDFTableExtraction


 def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
@@ -78,23 +77,6 @@ def output_pdf_page_text(pdf_folder: str, output_folder: str):
         logger.info(f"Successfully extracted text from {pdf_file}")


-def extract_pdf_table(pdf_folder: str, output_folder: str):
-    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
-        logger.error(f"Invalid pdf_folder: {pdf_folder}")
-        return
-    if output_folder is None or len(output_folder) == 0:
-        logger.error(f"Invalid output_folder: {output_folder}")
-        return
-    os.makedirs(output_folder, exist_ok=True)
-
-    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
-    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
-    for pdf_file in pdf_files:
-        logger.info(f"Start processing {pdf_file}")
-        pdf_table_extraction = PDFTableExtraction(pdf_file, output_folder)
-        pdf_table_extraction.extract_tables()
-
-
 def analyze_json_error():
     text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
     with open(text_file, "r", encoding="utf-8") as file:
@@ -1387,13 +1369,98 @@ def merge_aus_document_prospectus_data():
     )


-def get_pdf_2_html():
-    pass
+def pdf_exist():
+    data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
+    data_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
+    percentile_result_df = pd.read_excel(data_file, sheet_name="percentile_result")
+    document_id_list = percentile_result_df["DocumentId"].unique().tolist()
+
+    pdf_doc_path = r"/data/aus_prospectus/pdf/"
+    for doc_id in document_id_list:
+        pdf_file_path = os.path.join(pdf_doc_path, f"{doc_id}.pdf")
+        if not os.path.exists(pdf_file_path):
+            logger.error(f"pdf file not exist: {pdf_file_path}")
+        else:
+            logger.info(f"pdf file exist: {pdf_file_path}")
+
+
+def prepare_multi_fund_aus_prospectus_document():
+    data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
+    document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
+    document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
+
+    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
+    document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
+
+    document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
+    document_data_df.fillna("", inplace=True)
+    # get data from document_data_df which SecurityName is not empty string
+    document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
+    document_id_list = document_data_df["DocumentId"].unique().tolist()
+
+    # get document which fund count > 1
+    document_fund_count_df = document_fund_count_df[document_fund_count_df["DocumentId"].isin(document_id_list)]
+    document_fund_count_df = document_fund_count_df[document_fund_count_df["DistinctFundCount"] > 1]
+    document_fund_count_df = document_fund_count_df.sort_values(by="DistinctFundCount", ascending=False)
+    # Calculate percentile
+    percentiles = [0, 0.3, 0.6, 1]
+    quantile_values = document_fund_count_df['DistinctFundCount'].quantile(percentiles)
+
+    # Group by percentile
+    bins = [quantile_values[0], quantile_values[0.3], quantile_values[0.6], quantile_values[1]]
+    document_fund_count_df['Percentile_Group'] = pd.cut(document_fund_count_df['DistinctFundCount'], bins=bins, labels=["0-30", "30-60", "60-100"], include_lowest=True)
+
+    # Get relevant samples based on percentile group
+    percentile_result = pd.DataFrame()
+    for group, count in zip(["0-30", "30-60", "60-100"], [30, 30, 40]):
+        group_df = document_fund_count_df[document_fund_count_df['Percentile_Group'] == group]
+        sampled_df = group_df.sample(n=min(len(group_df), count), random_state=42)
+        percentile_result = pd.concat([percentile_result, sampled_df], ignore_index=True)
+    percentile_result.reset_index(drop=True, inplace=True)
+    document_id_list = percentile_result["DocumentId"].unique().tolist()
+    final_document_mapping_df = document_mapping_df[document_mapping_df["DocumentId"].isin(document_id_list)]
+    # order by DocumentId, FundLegalName, FundClassLegalName
+    final_document_mapping_df = final_document_mapping_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
+    final_document_mapping_df.reset_index(drop=True, inplace=True)
+
+    # get CompanyId, CompanyName from final_document_mapping_df
+    final_document_provider_df = final_document_mapping_df[["CompanyId", "CompanyName"]].drop_duplicates()
+    # order by CompanyName
+    final_document_provider_df = final_document_provider_df.sort_values(by="CompanyName", ascending=True)
+    final_document_provider_df.reset_index(drop=True, inplace=True)
+
+    final_document_data_df = document_data_df[document_data_df["DocumentId"].isin(document_id_list)]
+    # order by DocumentId, FundLegalName, FundClassLegalName
+    final_document_data_df = final_document_data_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
+    final_document_data_df.reset_index(drop=True, inplace=True)
+
+    output_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
+    with pd.ExcelWriter(output_file) as writer:
+        final_document_mapping_df.to_excel(
+            writer, sheet_name="document_mapping", index=False
+        )
+        final_document_provider_df.to_excel(
+            writer, sheet_name="document_provider", index=False
+        )
+        final_document_data_df.to_excel(
+            writer, sheet_name="aus_document_data", index=False
+        )
+        percentile_result.to_excel(
+            writer, sheet_name="percentile_result", index=False
+        )
+    output_sample_document_file = os.path.join(r"./sample_documents/",
+                                               "aus_prospectus_100_documents_multi_fund_sample.txt")
+    # output document id to txt file
+    with open(output_sample_document_file, "w") as f:
+        for doc_id in document_id_list:
+            f.write(f"{doc_id}\n")



 if __name__ == "__main__":
-    merge_aus_document_prospectus_data()
+    # pdf_exist()
+    prepare_multi_fund_aus_prospectus_document()
+    # merge_aus_document_prospectus_data()
     folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
     file_name = "doc_ar_data_for_emea_11_06.xlsx"
     # get_document_with_all_4_data_points(folder, file_name, None)
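
The new prepare_multi_fund_aus_prospectus_document() above builds the 100-document sample by bucketing multi-fund documents on DistinctFundCount quantiles (0-30, 30-60, 60-100) and drawing 30/30/40 documents per bucket with a fixed random_state. A minimal self-contained sketch of that quantile-bucket sampling follows; the synthetic fund counts stand in for the real document_fund_count sheet and are illustrative only.

# Illustrative only: synthetic data replaces the real Excel input.
import pandas as pd

counts = pd.DataFrame({
    "DocumentId": range(1, 501),
    "DistinctFundCount": [(i % 37) + 2 for i in range(500)],  # all counts > 1
})

# Quantile edges over the fund counts define three buckets.
qs = counts["DistinctFundCount"].quantile([0, 0.3, 0.6, 1])
counts["Percentile_Group"] = pd.cut(
    counts["DistinctFundCount"],
    bins=[qs[0], qs[0.3], qs[0.6], qs[1]],
    labels=["0-30", "30-60", "60-100"],
    include_lowest=True,
)

# Draw a fixed number of documents from each bucket, reproducibly.
parts = []
for group, n in zip(["0-30", "30-60", "60-100"], [30, 30, 40]):
    group_df = counts[counts["Percentile_Group"] == group]
    parts.append(group_df.sample(n=min(len(group_df), n), random_state=42))
sample = pd.concat(parts, ignore_index=True)

print(sample["Percentile_Group"].value_counts())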
@@ -1447,18 +1514,19 @@ if __name__ == "__main__":
     output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
     output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx"
-    pdf_folder = r"/data/illume/japan_prospectus/pdf/"
-    doc_ar_data_file_path = None
-    doc_mapping_data_file_path = r"/data/illume/japan_prospectus/materials/document_mapping.xlsx"
-    output_data_folder = r"/data/illume/japan_prospectus/materials/"
-    output_file = "japan_prospectus_statistics.xlsx"
+
+    # pdf_folder = r"/data/aus_prospectus/pdf/"
+    # doc_ar_data_file_path = None
+    # doc_mapping_data_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+    # output_data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
+    # output_file = "aus_100_document_prospectus_multi_fund_statistics.xlsx"
     # statistics_document(pdf_folder=pdf_folder,
     #                     doc_mapping_file_path=doc_mapping_data_file_path,
     #                     doc_ar_data_file_path=doc_ar_data_file_path,
-    #                     mapping_sheet_name="Sheet1",
-    #                     ar_data_sheet_name="doc_ar_data_in_db",
+    #                     mapping_sheet_name="document_mapping",
+    #                     ar_data_sheet_name="aus_document_data",
     #                     output_folder=output_data_folder,
     #                     output_file=output_file)

     # get_document_extracted_share_diff_by_db()
     # statistics_provider_mapping(
     #     provider_mapping_data_file=provider_mapping_data_file,
@@ -0,0 +1,100 @@
+539559156
+544218181
+541357691
+539601979
+518496603
+498429337
+515319827
+500162212
+557034899
+520954335
+555654388
+521324579
+539791416
+520663234
+534711945
+551941891
+509391262
+521618542
+557362556
+556248266
+497151360
+521625399
+539559165
+539601349
+557362553
+524631914
+530101994
+533499706
+557526104
+521618713
+536014247
+534287530
+553449194
+526644754
+550769189
+526200526
+539604078
+535821734
+511351053
+515041837
+550026958
+554124220
+539794750
+539580710
+526200513
+557065365
+551941902
+535827969
+535825845
+557526108
+544886026
+539794737
+540028470
+552995727
+517738771
+521630976
+539601982
+529881356
+546269437
+521822717
+515499515
+521617820
+539791633
+524248225
+492008222
+522608195
+515978340
+560485097
+546289968
+558726012
+547567013
+492202154
+520698753
+540453691
+539791362
+535825766
+534287518
+552727485
+521617806
+539266817
+557362542
+522879836
+535821752
+528208771
+518492363
+552505270
+502017146
+545500792
+539604126
+503209562
+521822710
+550533961
+513111748
+545750078
+539581666
+521609521
+542630657
+491033056
+560260579
+521618962