import pandas as pd
import os
from tqdm import tqdm
import json
from glob import glob
import fitz
import re
import time
import traceback
import json_repair
from copy import deepcopy

from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from core.auz_nz.hybrid_solution_script import final_function_to_match


def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
    doc_provider_data = pd.read_excel(doc_provider_file_path)
    # group by DocumentId and count the rows for each document,
    # producing a new frame with two columns: DocumentId and provider_count
    doc_provider_count = (
        doc_provider_data.groupby("DocumentId")
        .size()
        .reset_index(name="provider_count")
    )
    # sort by provider_count in descending order
    doc_provider_count = doc_provider_count.sort_values(
        by="provider_count", ascending=False
    )

    # save doc_provider_data and doc_provider_count back to the same Excel file
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        doc_provider_data.to_excel(
            writer, sheet_name="doc_provider_details", index=False
        )
        doc_provider_count.to_excel(
            writer, sheet_name="doc_provider_count", index=False
        )


def download_pdf(doc_provider_file_path: str,
                 sheet_name: str,
                 pdf_path: str,
                 doc_id_column: str = "DocumentId"):
    document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    # get all unique docids as list
    doc_id_list = [
        str(doc_id) for doc_id in document_data[doc_id_column].unique().tolist()
    ]
    # download pdfs
    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
    for doc_id in tqdm(doc_id_list):
        logger.info(f"Downloading pdf for docid: {doc_id}")
        download_pdf_from_documents_warehouse(pdf_directory=pdf_path, doc_id=doc_id)
        time.sleep(1)
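

# Illustrative usage sketch (not part of the original module): how download_pdf is
# typically invoked. The file path, sheet name and output folder below are
# hypothetical examples, not values taken from the original code.
def _example_download_pdf_usage():
    download_pdf(
        doc_provider_file_path="/data/emea_ar/basic_information/English/doc_provider.xlsx",  # hypothetical path
        sheet_name="doc_provider_count",  # hypothetical sheet name
        pdf_path="/data/emea_ar/pdf/",  # hypothetical output folder
    )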


def output_pdf_page_text(pdf_folder: str, output_folder: str):
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_util = PDFUtil(pdf_file)
        success, text, page_text_dict = pdf_util.extract_text(
            output_folder=output_folder
        )
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")


def analyze_json_error():
    text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
    with open(text_file, "r", encoding="utf-8") as file:
        text = file.read()
    json_response = re.search(r"```json([\s\S]*)```", text)
    if json_response:
        json_text = json_response.group(1)
        json_data = {"tables": []}
        try:
            json_data = json.loads(json_text)
        except Exception:
            json_data = json_repair.loads(json_text)
        table_list = json_data.get("tables", [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join("/temp/", f"temp_{table_num}.md")
            table = re.sub(r"(\n)+", "\n", table)
            with open(table_md_file, "w", encoding="utf-8") as file:
                file.write(table)
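

# A minimal sketch (not part of the original module) that factors out the
# "parse with json.loads, fall back to json_repair" pattern used above so the
# same behaviour can be reused elsewhere. The helper name is hypothetical.
def _load_json_with_repair(json_text: str):
    """Parse JSON text, falling back to json_repair for malformed LLM output."""
    try:
        return json.loads(json_text)
    except ValueError:
        return json_repair.loads(json_text)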


def statistics_document(
    pdf_folder: str,
    doc_mapping_file_path: str,
    doc_ar_data_file_path: str,
    mapping_sheet_name: str = "Sheet1",
    ar_data_sheet_name: str = "doc_ar_data_in_db",
    output_folder: str = "/data/emea_ar/basic_information/English/",
    output_file: str = "doc_mapping_statistics_data.xlsx"
):
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if (
        doc_mapping_file_path is None
        or len(doc_mapping_file_path) == 0
        or not os.path.exists(doc_mapping_file_path)
    ):
        logger.error(f"Invalid doc_mapping_file_path: {doc_mapping_file_path}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)

    describe_stat_df_list = []
    # statistics of document mapping information
    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name=mapping_sheet_name)

    # count FundId values per DocumentId in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on DocumentId"
    )
    doc_fund_id_df = doc_mapping_data[["DocumentId", "FundId"]].drop_duplicates()
    doc_fund_count = (
        doc_fund_id_df.groupby("DocumentId").size().reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    doc_fund_count = doc_fund_count.sort_values(by="fund_count", ascending=False)
    # describe() fund_count in doc_fund_count and convert the result to a DataFrame
    doc_fund_count_stat_df = get_describe_stat(
        doc_fund_count, "fund_count", "doc_fund_count"
    )
    describe_stat_df_list.append(doc_fund_count_stat_df)

    # count FundClassId values per DocumentId in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on DocumentId"
    )
    doc_share_class_id_df = doc_mapping_data[
        ["DocumentId", "FundClassId"]
    ].drop_duplicates()
    doc_share_class_count = (
        doc_share_class_id_df.groupby("DocumentId")
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    doc_share_class_count = doc_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe() share_class_count in doc_share_class_count and convert to a DataFrame
    doc_share_class_count_stat_df = get_describe_stat(
        doc_share_class_count, "share_class_count", "doc_share_class_count"
    )
    describe_stat_df_list.append(doc_share_class_count_stat_df)

    # count FundId values per CompanyId / CompanyName in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on CompanyId and CompanyName"
    )
    provider_fund_id_df = doc_mapping_data[
        ["CompanyId", "CompanyName", "FundId"]
    ].drop_duplicates()
    provider_fund_count = (
        provider_fund_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    provider_fund_count = provider_fund_count.sort_values(
        by="fund_count", ascending=False
    )
    # describe() fund_count in provider_fund_count and convert to a DataFrame
    provider_fund_count_stat_df = get_describe_stat(
        provider_fund_count, "fund_count", "provider_fund_count"
    )
    describe_stat_df_list.append(provider_fund_count_stat_df)

    # count FundClassId values per CompanyId / CompanyName in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on CompanyId"
    )
    provider_share_class_id_df = doc_mapping_data[
        ["CompanyId", "CompanyName", "FundClassId"]
    ].drop_duplicates()
    provider_share_class_count = (
        provider_share_class_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    provider_share_class_count = provider_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe() share_class_count in provider_share_class_count and convert to a DataFrame
    provider_share_class_count_stat_df = get_describe_stat(
        provider_share_class_count, "share_class_count", "provider_share_class_count"
    )
    describe_stat_df_list.append(provider_share_class_count_stat_df)

    # count FundClassId values per FundId / FundLegalName in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on FundId and FundLegalName"
    )
    fund_share_class_id_df = doc_mapping_data[
        ["FundId", "FundLegalName", "FundClassId"]
    ].drop_duplicates()
    fund_share_class_count = (
        fund_share_class_id_df.groupby(["FundId", "FundLegalName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    fund_share_class_count = fund_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe() share_class_count in fund_share_class_count and convert to a DataFrame
    fund_share_class_count_stat_df = get_describe_stat(
        fund_share_class_count, "share_class_count", "fund_share_class_count"
    )
    describe_stat_df_list.append(fund_share_class_count_stat_df)

    stat_file = os.path.join(output_folder, output_file)

    doc_id_list = [str(docid) for docid in doc_mapping_data["DocumentId"].unique().tolist()]
    # statistics of document page numbers
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    logger.info("statistics document page number")
    doc_page_num_list = []
    for pdf_file in tqdm(pdf_files):
        pdf_base_name = os.path.basename(pdf_file).replace(".pdf", "")
        if pdf_base_name not in doc_id_list:
            continue
        docid = os.path.basename(pdf_file).split(".")[0]
        doc = fitz.open(pdf_file)
        page_num = doc.page_count
        doc_page_num_list.append({"DocumentId": docid, "page_num": page_num})
        doc.close()
    doc_page_num_df = pd.DataFrame(doc_page_num_list)
    # order by page_num in descending order
    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
    # describe() page_num and convert to a DataFrame
    doc_page_num_stat_df = get_describe_stat(
        doc_page_num_df, "page_num", "doc_page_num"
    )
    describe_stat_df_list.append(doc_page_num_stat_df)

    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)

    doc_dp_data_df = None
    if doc_ar_data_file_path is not None and os.path.exists(doc_ar_data_file_path):
        doc_ar_data = pd.read_excel(doc_ar_data_file_path, sheet_name=ar_data_sheet_name)
        doc_dp_result = get_document_with_all_4_data_points(None, None, doc_ar_data)
        doc_dp_data_list = []
        for doc_id in doc_id_list:
            doc_id = int(doc_id)
            doc_dp_data = {"DocumentId": doc_id, "tor": 0, "ter": 0, "ogc": 0, "perf_fee": 0}
            if doc_id in doc_dp_result["tor"]:
                doc_dp_data["tor"] = 1
            if doc_id in doc_dp_result["ter"]:
                doc_dp_data["ter"] = 1
            if doc_id in doc_dp_result["ogc"]:
                doc_dp_data["ogc"] = 1
            if doc_id in doc_dp_result["perf_fee"]:
                doc_dp_data["perf_fee"] = 1
            doc_dp_data_list.append(doc_dp_data)
        doc_dp_data_df = pd.DataFrame(doc_dp_data_list)
        doc_dp_data_df = doc_dp_data_df.sort_values(by="DocumentId", ascending=True)
        doc_dp_data_df.reset_index(drop=True, inplace=True)

    # cast DocumentId to string in all DataFrames before merging
    doc_page_num_df["DocumentId"] = doc_page_num_df["DocumentId"].astype(str)
    doc_fund_count["DocumentId"] = doc_fund_count["DocumentId"].astype(str)
    doc_share_class_count["DocumentId"] = doc_share_class_count["DocumentId"].astype(str)
    if doc_dp_data_df is not None:
        doc_dp_data_df["DocumentId"] = doc_dp_data_df["DocumentId"].astype(str)

    # merge doc_fund_count, doc_share_class_count and doc_dp_data_df into doc_page_num_df on DocumentId
    doc_page_num_df = doc_page_num_df.merge(doc_fund_count, on="DocumentId", how="left")
    doc_page_num_df = doc_page_num_df.merge(doc_share_class_count, on="DocumentId", how="left")
    if doc_dp_data_df is not None:
        doc_page_num_df = doc_page_num_df.merge(doc_dp_data_df, on="DocumentId", how="left")

    # save statistics data to excel
    with pd.ExcelWriter(stat_file) as writer:
        doc_page_num_df.to_excel(writer, sheet_name="doc_level_stats", index=False)
        # doc_dp_data_df.to_excel(writer, sheet_name="doc_dp_data", index=False)
        # doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
        # doc_share_class_count.to_excel(
        #     writer, sheet_name="doc_share_class_count", index=False
        # )
        provider_fund_count.to_excel(
            writer, sheet_name="provider_fund_count", index=False
        )
        provider_share_class_count.to_excel(
            writer, sheet_name="provider_share_class_count", index=False
        )
        fund_share_class_count.to_excel(
            writer, sheet_name="fund_share_class_count", index=False
        )
        describe_stat_df.to_excel(
            writer, sheet_name="all_describe_statistics", index=False
        )
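

# Illustrative usage sketch (not part of the original module): how
# statistics_document is expected to be called. All paths below are
# hypothetical examples; the remaining parameters keep their defaults.
def _example_statistics_document_usage():
    statistics_document(
        pdf_folder="/data/emea_ar/pdf/",  # hypothetical folder of downloaded PDFs
        doc_mapping_file_path="/data/emea_ar/basic_information/English/doc_mapping.xlsx",  # hypothetical
        doc_ar_data_file_path="/data/emea_ar/basic_information/English/doc_ar_data.xlsx",  # hypothetical
    )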


def get_document_with_all_4_data_points(folder: str, file_name: str, data: pd.DataFrame):
    if data is None:
        file_path = os.path.join(folder, file_name)
        if os.path.exists(file_path):
            data = pd.read_excel(file_path, sheet_name="doc_ar_data_in_db")
        else:
            logger.error(f"Invalid file path: {file_path}")
            return
    # get document id list for which noTor is 0
    noTor_0_doc_id_list = data[data["noTor"] == 0]["DocumentId"].unique().tolist()

    # get document id list for which share_noTer is 0
    share_noTer_0_doc_id_list = data[data["share_noTer"] == 0]["DocumentId"].unique().tolist()

    # get document id list for which share_noOgc is 0
    share_noOgc_0_doc_id_list = data[data["share_noOgc"] == 0]["DocumentId"].unique().tolist()

    # get document id list for which share_noPerfFee is 0
    share_noPerfFee_0_doc_id_list = data[data["share_noPerfFee"] == 0]["DocumentId"].unique().tolist()

    logger.info(f"noTor_0_doc_id_list: {len(noTor_0_doc_id_list)}")
    logger.info(f"share_noTer_0_doc_id_list: {len(share_noTer_0_doc_id_list)}")
    logger.info(f"share_noOgc_0_doc_id_list: {len(share_noOgc_0_doc_id_list)}")
    logger.info(f"share_noPerfFee_0_doc_id_list: {len(share_noPerfFee_0_doc_id_list)}")

    all_4_data_points_doc_id_list = list(
        set(noTor_0_doc_id_list)
        & set(share_noTer_0_doc_id_list)
        & set(share_noOgc_0_doc_id_list)
        & set(share_noPerfFee_0_doc_id_list)
    )

    logger.info(f"all_4_data_points_doc_id_list: {len(all_4_data_points_doc_id_list)}")
    result = {"tor": noTor_0_doc_id_list,
              "ter": share_noTer_0_doc_id_list,
              "ogc": share_noOgc_0_doc_id_list,
              "perf_fee": share_noPerfFee_0_doc_id_list}
    return result
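

# Illustrative usage sketch (not part of the original module): the function above
# returns a dict of DocumentId lists keyed by data point, which callers use for
# membership checks. The folder, file name and DocumentId below are hypothetical.
def _example_get_document_with_all_4_data_points_usage():
    doc_dp_result = get_document_with_all_4_data_points(
        "/data/emea_ar/basic_information/English/",  # hypothetical folder
        "doc_ar_data.xlsx",  # hypothetical file name
        None,
    )
    if doc_dp_result is not None:
        has_tor = 445877368 in doc_dp_result["tor"]  # hypothetical DocumentId
        logger.info(f"document has TOR data: {has_tor}")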


def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str):
    if (
        provider_mapping_data_file is None
        or len(provider_mapping_data_file) == 0
        or not os.path.exists(provider_mapping_data_file)
    ):
        logger.error(
            f"Invalid provider_mapping_data_file: {provider_mapping_data_file}"
        )
        return
    provider_mapping_data = pd.read_excel(provider_mapping_data_file)

    describe_stat_df_list = []
    # count FundId values per CompanyId / CompanyName in provider_mapping_data
    logger.info(
        "statistics provider_mapping_data for counting FundId count based on CompanyId and CompanyName"
    )
    provider_fund_id_df = provider_mapping_data[
        ["CompanyId", "CompanyName", "FundId"]
    ].drop_duplicates()
    provider_fund_count = (
        provider_fund_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    provider_fund_count = provider_fund_count.sort_values(
        by="fund_count", ascending=False
    )
    # describe() fund_count in provider_fund_count and convert to a DataFrame
    provider_fund_count_stat_df = get_describe_stat(
        provider_fund_count, "fund_count", "provider_fund_count"
    )
    describe_stat_df_list.append(provider_fund_count_stat_df)

    # Get the fund_count sum of all companies
    all_companies_fund_count_sum = provider_fund_count["fund_count"].sum()

    top_n_company_fund_count_list = []
    # Get the fund_count sum of the top 5 companies
    top_5_companies_fund_count, top_5_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 5, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 5,
            "fund_count": top_5_companies_fund_count,
            "percent": top_5_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 5 companies fund count sum: {top_5_companies_fund_count}")
    # Get the fund_count sum of the top 10 companies
    top_10_companies_fund_count, top_10_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 10, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 10,
            "fund_count": top_10_companies_fund_count,
            "percent": top_10_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 10 companies fund count sum: {top_10_companies_fund_count}")

    # Get the fund_count sum of the top 50 companies
    top_50_companies_fund_count, top_50_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 50, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 50,
            "fund_count": top_50_companies_fund_count,
            "percent": top_50_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 50 companies fund count sum: {top_50_companies_fund_count}")

    # Get the fund_count sum of the top 100 companies
    top_100_companies_fund_count, top_100_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 100, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 100,
            "fund_count": top_100_companies_fund_count,
            "percent": top_100_companies_fund_count_percent,
        }
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": len(provider_fund_count),
            "fund_count": all_companies_fund_count_sum,
            "percent": 100,
        }
    )
    logger.info(f"Top 100 companies fund count sum: {top_100_companies_fund_count}")
    top_n_company_fund_count_df = pd.DataFrame(top_n_company_fund_count_list)

    # count SecId values per CompanyId / CompanyName in provider_mapping_data
    logger.info(
        "statistics provider_mapping_data for counting SecId count based on CompanyId and CompanyName"
    )
    provider_share_class_id_df = provider_mapping_data[
        ["CompanyId", "CompanyName", "SecId"]
    ].drop_duplicates()
    provider_share_class_count = (
        provider_share_class_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    provider_share_class_count = provider_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe() share_class_count in provider_share_class_count and convert to a DataFrame
    provider_share_class_count_stat_df = get_describe_stat(
        provider_share_class_count, "share_class_count", "provider_share_class_count"
    )
    describe_stat_df_list.append(provider_share_class_count_stat_df)

    # Get the share_class_count sum of all companies
    all_companies_share_class_count_sum = provider_share_class_count[
        "share_class_count"
    ].sum()

    top_n_company_share_class_count_list = []
    # Get the share_class_count sum of the top 5 companies
    top_5_companies_share_class_count, top_5_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            5,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 5,
            "share_class_count": top_5_companies_share_class_count,
            "percent": top_5_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 5 companies share class count sum: {top_5_companies_share_class_count}"
    )
    # Get the share_class_count sum of the top 10 companies
    top_10_companies_share_class_count, top_10_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            10,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 10,
            "share_class_count": top_10_companies_share_class_count,
            "percent": top_10_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 10 companies share class count sum: {top_10_companies_share_class_count}"
    )

    # Get the share_class_count sum of the top 50 companies
    top_50_companies_share_class_count, top_50_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            50,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 50,
            "share_class_count": top_50_companies_share_class_count,
            "percent": top_50_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 50 companies share class count sum: {top_50_companies_share_class_count}"
    )

    # Get the share_class_count sum of the top 100 companies
    top_100_companies_share_class_count, top_100_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            100,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 100,
            "share_class_count": top_100_companies_share_class_count,
            "percent": top_100_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 100 companies share class count sum: {top_100_companies_share_class_count}"
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": len(provider_share_class_count),
            "share_class_count": all_companies_share_class_count_sum,
            "percent": 100,
        }
    )
    top_n_company_share_class_count_df = pd.DataFrame(
        top_n_company_share_class_count_list
    )

    # count SecId values per FundId / FundLegalName in provider_mapping_data
    logger.info(
        "statistics provider_mapping_data for counting SecId count based on FundId and FundLegalName"
    )
    fund_share_class_id_df = provider_mapping_data[
        ["FundId", "FundLegalName", "SecId"]
    ].drop_duplicates()
    fund_share_class_count = (
        fund_share_class_id_df.groupby(["FundId", "FundLegalName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    fund_share_class_count = fund_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe() share_class_count in fund_share_class_count and convert to a DataFrame
    fund_share_class_count_stat_df = get_describe_stat(
        fund_share_class_count, "share_class_count", "fund_share_class_count"
    )
    describe_stat_df_list.append(fund_share_class_count_stat_df)
    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)

    stat_file = os.path.join(output_folder, "provider_mapping_data_statistics.xlsx")
    # save statistics data to excel
    with pd.ExcelWriter(stat_file) as writer:
        top_n_company_fund_count_df.to_excel(
            writer, sheet_name="top_n_provider_fund_count", index=False
        )
        top_n_company_share_class_count_df.to_excel(
            writer, sheet_name="top_n_provider_share_count", index=False
        )
        provider_fund_count.to_excel(
            writer, sheet_name="provider_fund_count", index=False
        )
        provider_share_class_count.to_excel(
            writer, sheet_name="provider_share_count", index=False
        )
        fund_share_class_count.to_excel(
            writer, sheet_name="fund_share_count", index=False
        )
        describe_stat_df.to_excel(
            writer, sheet_name="all_describe_statistics", index=False
        )


def statistics_document_fund_share_count(provider_mapping_data_file: str):
    if (
        provider_mapping_data_file is None
        or len(provider_mapping_data_file) == 0
        or not os.path.exists(provider_mapping_data_file)
    ):
        logger.error(f"Invalid file_path: {provider_mapping_data_file}")
        return

    describe_stat_df_list = []
    # statistics of document mapping information
    doc_mapping_data = pd.read_excel(provider_mapping_data_file, sheet_name="all_data")
    # set noTor to 0 if tor is not NaN, 1 otherwise
    doc_mapping_data["noTor"] = doc_mapping_data["tor"].apply(
        lambda x: 0 if pd.notna(x) else 1
    )
    # set share_noTer to 0 if share_ter is not NaN, 1 otherwise
    doc_mapping_data["share_noTer"] = doc_mapping_data["share_ter"].apply(
        lambda x: 0 if pd.notna(x) else 1
    )
    # set share_noOgc to 0 if share_ogc is not NaN, 1 otherwise
    doc_mapping_data["share_noOgc"] = doc_mapping_data["share_ogc"].apply(
        lambda x: 0 if pd.notna(x) else 1
    )
    # set share_noPerfFee to 0 if share_perfFee is not NaN, 1 otherwise
    doc_mapping_data["share_noPerfFee"] = doc_mapping_data["share_perfFee"].apply(
        lambda x: 0 if pd.notna(x) else 1
    )

    # count FundId values per DocumentId in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on DocumentId"
    )
    doc_fund_id_df = doc_mapping_data[["DocumentId", "EffectiveDate", "CompanyId", "CompanyName", "FundId"]].drop_duplicates()
    doc_fund_count = (
        doc_fund_id_df.groupby(["DocumentId", "EffectiveDate", "CompanyId", "CompanyName"]).size().reset_index(name="fund_count")
    )
    # order by fund_count in ascending order
    doc_fund_count = doc_fund_count.sort_values(by="fund_count", ascending=True)

    # set with_ar_data to True if any of noTor, share_noTer, share_noOgc or share_noPerfFee is 0
    doc_fund_count["with_ar_data"] = False
    for index, row in doc_fund_count.iterrows():
        document_id = row["DocumentId"]
        ar_data = doc_mapping_data[
            (doc_mapping_data["DocumentId"] == document_id)
            & (
                (doc_mapping_data["noTor"] == 0)
                | (doc_mapping_data["share_noTer"] == 0)
                | (doc_mapping_data["share_noOgc"] == 0)
                | (doc_mapping_data["share_noPerfFee"] == 0)
            )
        ]
        if len(ar_data) > 0:
            doc_fund_count.loc[index, "with_ar_data"] = True

    # describe() fund_count in doc_fund_count and convert to a DataFrame
    doc_fund_count_stat_df = get_describe_stat(
        doc_fund_count, "fund_count", "doc_fund_count"
    )
    describe_stat_df_list.append(doc_fund_count_stat_df)

    # count FundClassId values per DocumentId in doc_mapping_data
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on DocumentId"
    )
    doc_share_class_id_df = doc_mapping_data[
        ["DocumentId", "EffectiveDate", "CompanyId", "CompanyName", "FundClassId"]
    ].drop_duplicates()
    doc_share_class_count = (
        doc_share_class_id_df.groupby(["DocumentId", "EffectiveDate", "CompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in ascending order
    doc_share_class_count = doc_share_class_count.sort_values(
        by="share_class_count", ascending=True
    )
    # set with_ar_data to True if any of noTor, share_noTer, share_noOgc or share_noPerfFee is 0
    doc_share_class_count["with_ar_data"] = False
    for index, row in doc_share_class_count.iterrows():
        document_id = row["DocumentId"]
        ar_data = doc_mapping_data[
            (doc_mapping_data["DocumentId"] == document_id)
            & (
                (doc_mapping_data["noTor"] == 0)
                | (doc_mapping_data["share_noTer"] == 0)
                | (doc_mapping_data["share_noOgc"] == 0)
                | (doc_mapping_data["share_noPerfFee"] == 0)
            )
        ]
        if len(ar_data) > 0:
            doc_share_class_count.loc[index, "with_ar_data"] = True

    # describe() share_class_count in doc_share_class_count and convert to a DataFrame
    doc_share_class_count_stat_df = get_describe_stat(
        doc_share_class_count, "share_class_count", "doc_share_class_count"
    )
    describe_stat_df_list.append(doc_share_class_count_stat_df)

    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)

    with pd.ExcelWriter(provider_mapping_data_file) as writer:
        doc_mapping_data.to_excel(writer, sheet_name="all_data", index=False)
        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
        doc_share_class_count.to_excel(writer, sheet_name="doc_share_class_count", index=False)
        describe_stat_df.to_excel(writer, sheet_name="all_describe_statistics", index=False)


def get_top_n_records_count(
    df: pd.DataFrame, column_name: str, n: int, total_count: int
):
    top_n_records = df.head(n)
    top_n_records_count = top_n_records[column_name].sum()
    top_n_records_count_percent = round((top_n_records_count / total_count) * 100, 2)
    return top_n_records_count, top_n_records_count_percent
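

# Illustrative worked example (not part of the original module) of how
# get_top_n_records_count is used. Note the implicit assumption: the frame must
# already be sorted by the target column in descending order, because the
# function simply takes df.head(n). The data below is made up.
def _example_get_top_n_records_count():
    demo = pd.DataFrame({"CompanyName": ["A", "B", "C", "D"],
                         "fund_count": [40, 30, 20, 10]})
    demo = demo.sort_values(by="fund_count", ascending=False)
    top_count, top_percent = get_top_n_records_count(
        demo, "fund_count", 2, demo["fund_count"].sum()
    )
    # top_count == 70 and top_percent == 70.0 for this toy frame
    logger.info(f"top 2 fund_count sum: {top_count} ({top_percent}%)")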


def get_describe_stat(df: pd.DataFrame, column_name: str, stat_type_name: str):
    stat_df = df[column_name].describe().reset_index().T
    stat_df.columns = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    stat_df.reset_index(inplace=True)
    stat_df.rename(columns={"index": "Stat"}, inplace=True)
    # remove the first row (the describe() index labels)
    stat_df = stat_df[1:]
    if stat_type_name is not None:
        stat_df["Stat_Type"] = stat_type_name
        stat_df = stat_df[
            [
                "Stat_Type",
                "count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ]
        ]
    return stat_df
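

# Illustrative sketch (not part of the original module) of what get_describe_stat
# returns: a one-row frame whose Stat_Type column tags the describe() statistics
# of a single numeric column. The data below is made up.
def _example_get_describe_stat():
    demo = pd.DataFrame({"fund_count": [1, 2, 3, 4, 5]})
    stat_row = get_describe_stat(demo, "fund_count", "demo_fund_count")
    # stat_row has columns: Stat_Type, count, mean, std, min, 25%, 50%, 75%, max
    logger.info(stat_row.to_string(index=False))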


def pickup_document_from_top_100_providers():
    """
    Pick up 100 documents from the top 100 providers.
    The selected documents have 10 or fewer share classes.
    The purpose is to analyze document structure and content from small documents.
    """
    provider_mapping_data_file = (
        r"/data/emea_ar/basic_information/English/provider_mapping_data_statistics.xlsx"
    )
    top_100_provider_document_file = (
        r"/data/emea_ar/basic_information/English/lux_english_ar_from_top_100_provider_since_2020.xlsx"
    )
    provider_share_count = pd.read_excel(
        provider_mapping_data_file, sheet_name="provider_share_count"
    )
    # add a new column named share_count_rank to provider_share_count
    provider_share_count["share_count_rank"] = provider_share_count[
        "share_class_count"
    ].rank(method="min", ascending=False)

    top_100_provider_document_all_data = pd.read_excel(
        top_100_provider_document_file, sheet_name="all_data"
    )

    top_100_provider_document_fund_count = pd.read_excel(
        top_100_provider_document_file, sheet_name="doc_fund_count"
    )
    top_100_provider_document_fund_count.reset_index(drop=True, inplace=True)

    top_100_provider_document_share_count = pd.read_excel(
        top_100_provider_document_file, sheet_name="doc_share_class_count"
    )
    top_100_provider_document_share_count = \
        top_100_provider_document_share_count[top_100_provider_document_share_count["with_ar_data"] == True]
    top_100_provider_document_share_count.reset_index(drop=True, inplace=True)

    top_100_provider_document_share_count = pd.merge(
        top_100_provider_document_share_count,
        top_100_provider_document_fund_count,
        on=["DocumentId"],
        how="left",
    )
    top_100_provider_document_share_count = top_100_provider_document_share_count[
        ["DocumentId", "CompanyId_x", "CompanyName_x", "fund_count", "share_class_count"]
    ]
    top_100_provider_document_share_count.rename(
        columns={"CompanyId_x": "CompanyId"}, inplace=True
    )

    # add share_count_rank to top_100_provider_document_share_count by merging with provider_share_count
    top_100_provider_document_share_count = pd.merge(
        top_100_provider_document_share_count,
        provider_share_count,
        on=["CompanyId"],
        how="left",
    )
    # Keep columns: DocumentId, CompanyId, CompanyName, fund_count, share_class_count_x, share_count_rank
    top_100_provider_document_share_count = top_100_provider_document_share_count[
        ["DocumentId", "CompanyId", "CompanyName", "fund_count", "share_class_count_x", "share_count_rank"]
    ]
    # rename share_class_count_x to share_class_count and share_count_rank to provider_share_count_rank
    top_100_provider_document_share_count.rename(
        columns={"share_class_count_x": "share_class_count",
                 "share_count_rank": "provider_share_count_rank"}, inplace=True
    )
    top_100_provider_document_share_count = top_100_provider_document_share_count.sort_values(
        by=["provider_share_count_rank", "share_class_count"], ascending=True
    )

    # for each provider_share_count_rank, randomly pick one document with 1 to 10 share classes
    data_filter = top_100_provider_document_share_count[
        (top_100_provider_document_share_count["share_class_count"] <= 10)
        & (top_100_provider_document_share_count["share_class_count"] >= 1)
    ]
    data_filter = data_filter.sort_values(
        by=["provider_share_count_rank", "share_class_count"], ascending=[True, True]
    )
    unique_rank_list = top_100_provider_document_share_count["provider_share_count_rank"].unique().tolist()
    random_pickup_document_data_list = []
    for rank in unique_rank_list:
        data_filter_rank = data_filter[data_filter["provider_share_count_rank"] == rank]
        if len(data_filter_rank) == 0:
            # fall back to the first document with this rank from top_100_provider_document_share_count
            data_filter_rank = top_100_provider_document_share_count[
                top_100_provider_document_share_count["provider_share_count_rank"] == rank
            ].head(1)
        data_filter_rank = data_filter_rank.sample(n=1, random_state=88)
        random_pickup_document_data_list.append(data_filter_rank)
    random_pickup_document_data = pd.concat(random_pickup_document_data_list)
    # sort by provider_share_count_rank in ascending order
    random_pickup_document_data = random_pickup_document_data.sort_values(
        by="provider_share_count_rank", ascending=True
    )
    random_pickup_document_data.reset_index(drop=True, inplace=True)

    random_pickup_document_mini_data = random_pickup_document_data[
        ["DocumentId", "provider_share_count_rank"]
    ]
    # get all data from top_100_provider_document_all_data by merge with random_pickup_document_mini_data
    random_pickup_document_all_data = pd.merge(
        random_pickup_document_mini_data,
        top_100_provider_document_all_data,
        on=["DocumentId"],
        how="left",
    )
    # sort random_pickup_document_all_data by provider_share_count_rank, FundLegalName, FundClassLegalName in ascending order
    random_pickup_document_all_data = random_pickup_document_all_data.sort_values(
        by=["provider_share_count_rank", "FundLegalName", "FundClassLegalName"], ascending=True
    )

    random_small_document_data_file = (
        r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
    )
    with pd.ExcelWriter(random_small_document_data_file) as writer:
        top_100_provider_document_share_count.to_excel(
            writer, sheet_name="all_doc_with_ar_data", index=False
        )
        random_pickup_document_data.to_excel(
            writer, sheet_name="random_small_document", index=False
        )
        random_pickup_document_all_data.to_excel(
            writer, sheet_name="random_small_document_all_data", index=False
        )


def compare_records_count_by_document_id():
    data_from_document = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
    sheet_name = "mapping_data"
    data_from_document_df = pd.read_excel(data_from_document, sheet_name=sheet_name)
    data_from_document_df.rename(
        columns={"doc_id": "DocumentId"}, inplace=True
    )
    # get the count of records by DocumentId
    document_records_count = data_from_document_df.groupby("DocumentId").size().reset_index(name="records_count")

    data_from_database = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document_from_DocumentAcquisition.xlsx"
    sheet_name = "random_small_document_all_data"
    data_from_database_df = pd.read_excel(data_from_database, sheet_name=sheet_name)
    database_records_count = data_from_database_df.groupby("DocumentId").size().reset_index(name="records_count")

    # merge document_records_count with database_records_count
    records_count_compare = pd.merge(
        document_records_count,
        database_records_count,
        on=["DocumentId"],
        how="left",
    )
    records_count_compare["records_count_diff"] = records_count_compare["records_count_x"] - records_count_compare["records_count_y"]
    records_count_compare = records_count_compare.sort_values(by="records_count_diff", ascending=False)
    # rename records_count_x to records_count_document, records_count_y to records_count_database
    records_count_compare.rename(
        columns={"records_count_x": "records_count_document",
                 "records_count_y": "records_count_database"}, inplace=True
    )
    records_count_compare.reset_index(drop=True, inplace=True)

    records_count_compare_file = (
        r"/data/emea_ar/basic_information/English/records_count_compare_between_document_database_from_DocumentAcquisition.xlsx"
    )
    with pd.ExcelWriter(records_count_compare_file) as writer:
        records_count_compare.to_excel(
            writer, sheet_name="records_count_compare", index=False
        )


def get_document_extracted_share_diff_by_db():
    db_data_file = r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document_from_DocumentAcquisition.xlsx"
    extract_data_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"

    doc_mapping_folder = r"/data/emea_ar/output/mapping/document/"
    db_data = pd.read_excel(db_data_file, sheet_name="Sheet1")
    extract_data = pd.read_excel(extract_data_file, sheet_name="mapping_data")
    # optionally keep only data where investment_type is 1
    # extract_data = extract_data[extract_data["investment_type"] == 1]
    extract_data.reset_index(drop=True, inplace=True)
    unique_doc_id = extract_data["doc_id"].unique().tolist()

    status_info = {
        1: "WIP",
        5: "Junked",
        3: "AutoSignoff",
        2: "Signoffed",
        10: "Complete",
        20: "AutoDetect",
        21: "Checkduplicate",
        22: "Mapping",
        33: "Not Matched",
        99: "Unknown",
    }
    document_extract_db_compare = []
    for doc_id in unique_doc_id:
        doc_mapping_file = os.path.join(doc_mapping_folder, f"{doc_id}.xlsx")
        if not os.path.exists(doc_mapping_file):
            logger.error(f"Invalid mapping_file: {doc_mapping_file}")
            doc_mapping_share_class_id_df = pd.DataFrame()
        else:
            doc_mapping_data = pd.read_excel(doc_mapping_file)
            doc_mapping_share_class_id_df = doc_mapping_data[["SecId"]].drop_duplicates()

        ar_db_data_doc = db_data[db_data["DocumentId"] == doc_id]
        try:
            masterProcess_status = ar_db_data_doc["MasterProcess_Status"].values[0]
        except Exception as e:
            logger.error(f"Error: {e}")
            masterProcess_status = 99
        masterProcess_status = int(masterProcess_status)
        masterProcess_status_definition = status_info.get(masterProcess_status, "Unknown")
        # keep data from ar_db_data_doc where noTor == 0, share_noTer == 0, share_noOgc == 0 or share_noPerfFee == 0
        ar_db_data_doc = ar_db_data_doc[
            (ar_db_data_doc["noTor"] == 0)
            | (ar_db_data_doc["share_noTer"] == 0)
            | (ar_db_data_doc["share_noOgc"] == 0)
            | (ar_db_data_doc["share_noPerfFee"] == 0)
        ]

        extract_data_doc = extract_data[extract_data["doc_id"] == doc_id]
        # unique raw_name values in extract_data_doc
        unique_raw_name = extract_data_doc["raw_name"].unique().tolist()

        doc_mapping_share_class_count = len(doc_mapping_share_class_id_df)
        extract_share_class_count = len(unique_raw_name)
        extract_vs_doc_share_count_diff = extract_share_class_count - doc_mapping_share_class_count
        db_share_class_count = len(ar_db_data_doc)
        extract_vs_ar_db_share_count_diff = extract_share_class_count - db_share_class_count
        document_extract_db_compare.append({
            "DocumentId": doc_id,
            "status": masterProcess_status,
            "status_definition": masterProcess_status_definition,
            "extract_share_count": extract_share_class_count,
            "doc_share_count": doc_mapping_share_class_count,
            "extract_vs_doc_share_count_diff": extract_vs_doc_share_count_diff,
            "ar_db_share_count": db_share_class_count,
            "extract_vs_ar_db_share_count_diff": extract_vs_ar_db_share_count_diff,
        })
    document_extract_db_compare_df = pd.DataFrame(document_extract_db_compare)
    # output to excel
    document_extract_db_compare_file = (
        r"/data/emea_ar/basic_information/English/document_extract_db_compare.xlsx"
    )
    with pd.ExcelWriter(document_extract_db_compare_file) as writer:
        document_extract_db_compare_df.to_excel(
            writer, sheet_name="document_extract_db_compare", index=False
        )


def concat_mapping(mapping_folder: str,
                   output_file: str):
    excel_files = glob(os.path.join(mapping_folder, "*.xlsx"))
    logger.info(f"Total {len(excel_files)} excel files found in {mapping_folder}")
    all_data_list = []
    for excel_file in excel_files:
        doc_mapping_data = pd.read_excel(excel_file)
        all_data_list.append(doc_mapping_data)
    all_data = pd.concat(all_data_list)
    all_data.reset_index(drop=True, inplace=True)
    with open(output_file, "wb") as f:
        all_data.to_excel(f, index=False)


def calc_typical_doc_metrics_v2():
    """
    Compute metrics for typical documents.
    1. Fund level datapoint: TOR
    2. Share level datapoints: OGC, TER, Performance fees
    3. Only records with a document investment mapping are counted.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx"
    sheet_name = "record_level_Results"
    data = pd.read_excel(result_file, sheet_name=sheet_name)
    data.fillna("", inplace=True)
    # keep only rows where valid is 1
    data = data[data["valid"] == 1]

    fund_raw_data_gt = []
    fund_raw_data_pred = []

    fund_mapping_data_gt = []
    fund_mapping_data_pred = []

    share_raw_data_gt = []
    share_raw_data_pred = []

    share_mapping_data_gt = []
    share_mapping_data_pred = []

    for idx, row in data.iterrows():
        raw_data_gt_count = row["Raw data in Doc"]
        raw_data_infer_count = row["Raw data in Inference"]
        if len(str(raw_data_gt_count)) > 0:
            raw_data_gt_count = int(raw_data_gt_count)
            raw_data_infer_count = int(raw_data_infer_count)

            raw_gt_list = [1 for i in range(raw_data_gt_count)]
            raw_pred_list = []
            if raw_data_infer_count > 0:
                raw_pred_list = [1 for i in range(raw_data_infer_count)]
            if len(raw_pred_list) < len(raw_gt_list):
                raw_pred_list.extend([0 for i in range(len(raw_gt_list) - len(raw_pred_list))])

        mapping_data_gt_count = row["data in DB"]
        # note: "data in Inferencce" matches the column name in the source spreadsheet
        mapping_data_infer_count = row["data in Inferencce"]
        if len(str(mapping_data_gt_count)) > 0:
            mapping_data_gt_count = int(mapping_data_gt_count)
            mapping_data_infer_count = int(mapping_data_infer_count)

            mapping_gt_list = [1 for i in range(mapping_data_gt_count)]
            mapping_pred_list = []
            if mapping_data_infer_count > 0:
                mapping_pred_list = [1 for i in range(mapping_data_infer_count)]
            if len(mapping_pred_list) < len(mapping_gt_list):
                mapping_pred_list.extend([0 for i in range(len(mapping_gt_list) - len(mapping_pred_list))])

        data_level = row["data_level"]
        if data_level == "fund":
            fund_raw_data_gt.extend(raw_gt_list)
            fund_raw_data_pred.extend(raw_pred_list)

            fund_mapping_data_gt.extend(mapping_gt_list)
            fund_mapping_data_pred.extend(mapping_pred_list)
        else:
            share_raw_data_gt.extend(raw_gt_list)
            share_raw_data_pred.extend(raw_pred_list)

            share_mapping_data_gt.extend(mapping_gt_list)
            share_mapping_data_pred.extend(mapping_pred_list)

    # manually append six predicted-but-absent records (false positives) to the share-level lists
    share_raw_data_gt.extend([0, 0, 0, 0, 0, 0])
    share_raw_data_pred.extend([1, 1, 1, 1, 1, 1])

    share_mapping_data_gt.extend([0, 0, 0, 0, 0, 0])
    share_mapping_data_pred.extend([1, 1, 1, 1, 1, 1])

    fund_raw_data_accuracy = accuracy_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_precision = precision_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_recall = recall_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_f1 = f1_score(fund_raw_data_gt, fund_raw_data_pred)

    fund_mapping_data_accuracy = accuracy_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_precision = precision_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_recall = recall_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_f1 = f1_score(fund_mapping_data_gt, fund_mapping_data_pred)

    share_raw_data_accuracy = accuracy_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_precision = precision_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_recall = recall_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_f1 = f1_score(share_raw_data_gt, share_raw_data_pred)

    share_mapping_data_accuracy = accuracy_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_precision = precision_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_recall = recall_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_f1 = f1_score(share_mapping_data_gt, share_mapping_data_pred)

    final_data = []

    fund_raw_data_metrics = {"title": "Fund_Datapoint_Raw_Data",
                             "accuracy": fund_raw_data_accuracy,
                             "precision": fund_raw_data_precision,
                             "recall": fund_raw_data_recall,
                             "f1": fund_raw_data_f1,
                             "support": len(fund_raw_data_gt)}
    final_data.append(fund_raw_data_metrics)
    logger.info(f"fund_raw_data_accuracy: {fund_raw_data_accuracy}")
    logger.info(f"fund_raw_data_precision: {fund_raw_data_precision}")
    logger.info(f"fund_raw_data_recall: {fund_raw_data_recall}")
    logger.info(f"fund_raw_data_f1: {fund_raw_data_f1}")
    logger.info(f"fund_raw_data_support: {len(fund_raw_data_gt)}")

    fund_mapping_data_metrics = {"title": "Fund_Datapoint_Mapping_Data",
                                 "accuracy": fund_mapping_data_accuracy,
                                 "precision": fund_mapping_data_precision,
                                 "recall": fund_mapping_data_recall,
                                 "f1": fund_mapping_data_f1,
                                 "support": len(fund_mapping_data_gt)}
    final_data.append(fund_mapping_data_metrics)
    logger.info(f"fund_mapping_data_accuracy: {fund_mapping_data_accuracy}")
    logger.info(f"fund_mapping_data_precision: {fund_mapping_data_precision}")
    logger.info(f"fund_mapping_data_recall: {fund_mapping_data_recall}")
    logger.info(f"fund_mapping_data_f1: {fund_mapping_data_f1}")
    logger.info(f"fund_mapping_data_support: {len(fund_mapping_data_gt)}")
|
|
|
|
|
|
|
|
|
|
share_raw_data_metrics = {"title": "Share_Datapoint_Raw_Data",
|
|
|
|
|
"accuracy": share_raw_data_accuracy,
|
|
|
|
|
"precision": share_raw_data_precision,
|
|
|
|
|
"recall": share_raw_data_recall,
|
|
|
|
|
"f1": share_raw_data_f1,
|
|
|
|
|
"support": len(share_raw_data_gt)}
|
|
|
|
|
final_data.append(share_raw_data_metrics)
|
|
|
|
|
logger.info(f"share_raw_data_accuracy: {share_raw_data_accuracy}")
|
|
|
|
|
logger.info(f"share_raw_data_precision: {share_raw_data_precision}")
|
|
|
|
|
logger.info(f"share_raw_data_recall: {share_raw_data_recall}")
|
|
|
|
|
logger.info(f"share_raw_data_f1: {share_raw_data_f1}")
|
|
|
|
|
logger.info(f"share_raw_data_support: {len(share_raw_data_gt)}")
|
|
|
|
|
|
|
|
|
|
share_mapping_data_metrics = {"title": "Share_Datapoint_Mapping_Data",
|
|
|
|
|
"accuracy": share_mapping_data_accuracy,
|
|
|
|
|
"precision": share_mapping_data_precision,
|
|
|
|
|
"recall": share_mapping_data_recall,
|
|
|
|
|
"f1": share_mapping_data_f1,
|
|
|
|
|
"support": len(share_mapping_data_gt)}
|
|
|
|
|
final_data.append(share_mapping_data_metrics)
|
|
|
|
|
logger.info(f"share_mapping_data_accuracy: {share_mapping_data_accuracy}")
|
|
|
|
|
logger.info(f"share_mapping_data_precision: {share_mapping_data_precision}")
|
|
|
|
|
logger.info(f"share_mapping_data_recall: {share_mapping_data_recall}")
|
|
|
|
|
logger.info(f"share_mapping_data_f1: {share_mapping_data_f1}")
|
|
|
|
|
logger.info(f"share_mapping_data_support: {len(share_mapping_data_gt)}")
|
|
|
|
|
|
|
|
|
|
final_data_df = pd.DataFrame(final_data)
|
|
|
|
|
# set column order as title, accuracy, f1, precision, recall
|
|
|
|
|
final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
|
|
|
|
|
# output to excel
|
|
|
|
|
final_data_file = (
|
|
|
|
|
r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics_v2.xlsx"
|
|
|
|
|
)
|
|
|
|
|
with pd.ExcelWriter(final_data_file) as writer:
|
|
|
|
|
final_data_df.to_excel(
|
|
|
|
|
writer, sheet_name="metrics", index=False
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
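# Note on the metric construction above and in calc_typical_doc_metrics_v1 below:
# the review sheets only record *counts* (how many records exist in the document /
# database versus how many were inferred), so the counts are expanded into binary
# label lists before calling sklearn. A hypothetical example: if 5 shares exist in
# the document and inference returned 3, the ground truth becomes [1, 1, 1, 1, 1]
# and the prediction [1, 1, 1, 0, 0], i.e. two false negatives; when nothing exists
# in the document but records were still inferred, a pair of dummy labels
# (gt [0, 0] vs pred [1, 1]) is appended so the false positives are also counted.
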
def calc_typical_doc_metrics_v1():
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    result_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_20_new_emea_documents_sample_Accuracy.xlsx"
    sheet_name = "record_level_Results"
    data = pd.read_excel(result_file, sheet_name=sheet_name)
    data.fillna("", inplace=True)
    fund_raw_data_list = data["Raw Mapping"].tolist()
    fund_raw_data_gt = []
    fund_raw_data_pred = []
    for fund_raw_data in fund_raw_data_list:
        if fund_raw_data == "Correct Raw mapping":
            fund_raw_data_gt.append(1)
            fund_raw_data_pred.append(1)
        elif fund_raw_data == "Incorrect Raw mapping":
            fund_raw_data_gt.append(1)
            fund_raw_data_pred.append(0)
        else:
            pass
    fund_raw_data_accuracy = accuracy_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_precision = precision_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_recall = recall_score(fund_raw_data_gt, fund_raw_data_pred)
    fund_raw_data_f1 = f1_score(fund_raw_data_gt, fund_raw_data_pred)

    fund_mapping_data_list = data["Share Mapping"].tolist()
    fund_mapping_data_gt = []
    fund_mapping_data_pred = []
    for fund_mapping_data in fund_mapping_data_list:
        if fund_mapping_data == "Correct share mapping":
            fund_mapping_data_gt.append(1)
            fund_mapping_data_pred.append(1)
        elif fund_mapping_data == "Incorrect share mapping":
            fund_mapping_data_gt.append(1)
            fund_mapping_data_pred.append(0)
        else:
            pass
    fund_mapping_data_accuracy = accuracy_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_precision = precision_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_recall = recall_score(fund_mapping_data_gt, fund_mapping_data_pred)
    fund_mapping_data_f1 = f1_score(fund_mapping_data_gt, fund_mapping_data_pred)

    share_raw_data_gt = []
    share_raw_data_pred = []

    share_mapping_data_gt = []
    share_mapping_data_pred = []
    for idx, row in data.iterrows():
        share_raw_data_infer_count = row["Raw Share in Inference"]
        share_raw_data_gt_count = row["Raw Share in Doc"]
        if share_raw_data_gt_count is not None and \
                len(str(share_raw_data_gt_count)) > 0:
            share_raw_data_gt_count = int(share_raw_data_gt_count)
            share_raw_data_infer_count = int(share_raw_data_infer_count)

            gt_list = [1 for i in range(share_raw_data_gt_count)]
            if share_raw_data_infer_count > 0:
                pred_list = [1 for i in range(share_raw_data_infer_count)]
            else:
                pred_list = [1, 1]
                gt_list = [0, 0]
            if len(pred_list) < len(gt_list):
                pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))])
            share_raw_data_gt.extend(gt_list)
            share_raw_data_pred.extend(pred_list)

        share_mapping_data_infer_count = row["share in Inferencce"]
        share_mapping_data_gt_count = row["share in DB"]
        if share_mapping_data_gt_count is not None and \
                len(str(share_mapping_data_gt_count)) > 0:
            share_mapping_data_gt_count = int(share_mapping_data_gt_count)
            share_mapping_data_infer_count = int(share_mapping_data_infer_count)

            gt_list = [1 for i in range(share_mapping_data_gt_count)]
            if share_mapping_data_infer_count > 0:
                pred_list = [1 for i in range(share_mapping_data_infer_count)]
            else:
                pred_list = [1, 1]
                gt_list = [0, 0]
            if len(pred_list) < len(gt_list):
                pred_list.extend([0 for i in range(len(gt_list) - len(pred_list))])
            share_mapping_data_gt.extend(gt_list)
            share_mapping_data_pred.extend(pred_list)
    share_raw_data_accuracy = accuracy_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_precision = precision_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_recall = recall_score(share_raw_data_gt, share_raw_data_pred)
    share_raw_data_f1 = f1_score(share_raw_data_gt, share_raw_data_pred)

    share_mapping_data_accuracy = accuracy_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_precision = precision_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_recall = recall_score(share_mapping_data_gt, share_mapping_data_pred)
    share_mapping_data_f1 = f1_score(share_mapping_data_gt, share_mapping_data_pred)

    final_data = []

    fund_raw_data_metrics = {"title": "Fund_Raw_Data",
                             "accuracy": fund_raw_data_accuracy,
                             "precision": fund_raw_data_precision,
                             "recall": fund_raw_data_recall,
                             "f1": fund_raw_data_f1,
                             "support": len(fund_raw_data_gt)}
    final_data.append(fund_raw_data_metrics)
    logger.info(f"fund_raw_data_accuracy: {fund_raw_data_accuracy}")
    logger.info(f"fund_raw_data_precision: {fund_raw_data_precision}")
    logger.info(f"fund_raw_data_recall: {fund_raw_data_recall}")
    logger.info(f"fund_raw_data_f1: {fund_raw_data_f1}")

    fund_mapping_data_metrics = {"title": "Fund_Mapping_Data",
                                 "accuracy": fund_mapping_data_accuracy,
                                 "precision": fund_mapping_data_precision,
                                 "recall": fund_mapping_data_recall,
                                 "f1": fund_mapping_data_f1,
                                 "support": len(fund_mapping_data_gt)}
    final_data.append(fund_mapping_data_metrics)
    logger.info(f"fund_mapping_data_accuracy: {fund_mapping_data_accuracy}")
    logger.info(f"fund_mapping_data_precision: {fund_mapping_data_precision}")
    logger.info(f"fund_mapping_data_recall: {fund_mapping_data_recall}")
    logger.info(f"fund_mapping_data_f1: {fund_mapping_data_f1}")

    share_raw_data_metrics = {"title": "Share_Raw_Data",
                              "accuracy": share_raw_data_accuracy,
                              "precision": share_raw_data_precision,
                              "recall": share_raw_data_recall,
                              "f1": share_raw_data_f1,
                              "support": len(share_raw_data_gt)}
    final_data.append(share_raw_data_metrics)
    logger.info(f"share_raw_data_accuracy: {share_raw_data_accuracy}")
    logger.info(f"share_raw_data_precision: {share_raw_data_precision}")
    logger.info(f"share_raw_data_recall: {share_raw_data_recall}")
    logger.info(f"share_raw_data_f1: {share_raw_data_f1}")

    share_mapping_data_metrics = {"title": "Share_Mapping_Data",
                                  "accuracy": share_mapping_data_accuracy,
                                  "precision": share_mapping_data_precision,
                                  "recall": share_mapping_data_recall,
                                  "f1": share_mapping_data_f1,
                                  "support": len(share_mapping_data_gt)}
    final_data.append(share_mapping_data_metrics)
    logger.info(f"share_mapping_data_accuracy: {share_mapping_data_accuracy}")
    logger.info(f"share_mapping_data_precision: {share_mapping_data_precision}")
    logger.info(f"share_mapping_data_recall: {share_mapping_data_recall}")
    logger.info(f"share_mapping_data_f1: {share_mapping_data_f1}")

    final_data_df = pd.DataFrame(final_data)
    # set column order as title, accuracy, f1, precision, recall
    final_data_df = final_data_df[["title", "accuracy", "f1", "precision", "recall", "support"]]
    # output to excel
    final_data_file = (
        r"/data/emea_ar/output/metrics/mapping_data_info_20_new_emea_documents_sample_Accuracy_metrics.xlsx"
    )
    with pd.ExcelWriter(final_data_file) as writer:
        final_data_df.to_excel(
            writer, sheet_name="metrics", index=False
        )

def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
                                       aus_document_mapping_file: str = r"document_mapping.xlsx",
                                       aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx",
                                       document_mapping_sheet: str = "document_mapping",
                                       output_file: str = r"aus_document_prospectus.xlsx",
                                       output_sheet: str = "aus_document_prospectus"):
    """
    Merge AUS document and prospectus data.
    """
    aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file)
    aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file)
    aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet)
    aus_prospectus_data = pd.read_excel(aus_prospectus_data_file)

    aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)

    aus_document_prospectus_data = pd.merge(
        aus_document_data,
        aus_prospectus_data,
        on=["FundClassId", "EffectiveDate"],
        how="left",
    )
    aus_document_prospectus_file = os.path.join(aus_data_folder, output_file)
    with pd.ExcelWriter(aus_document_prospectus_file) as writer:
        aus_document_prospectus_data.to_excel(
            writer, sheet_name=output_sheet, index=False
        )

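# Note: the merge above uses how="left", so every row of the document mapping is
# kept and the prospectus columns are left empty (NaN) where no (FundClassId,
# EffectiveDate) pair matches. A hypothetical example: a mapping row for
# FundClassId "F123" dated 2024-06-30 with no prospectus record still appears in
# the output, with blank prospectus columns.
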
def pdf_exist():
    data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
    data_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
    percentile_result_df = pd.read_excel(data_file, sheet_name="percentile_result")
    document_id_list = percentile_result_df["DocumentId"].unique().tolist()

    pdf_doc_path = r"/data/aus_prospectus/pdf/"
    for doc_id in document_id_list:
        pdf_file_path = os.path.join(pdf_doc_path, f"{doc_id}.pdf")
        if not os.path.exists(pdf_file_path):
            logger.error(f"pdf file does not exist: {pdf_file_path}")
        else:
            logger.info(f"pdf file exists: {pdf_file_path}")

def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
                                               document_mapping_file: str = "document_mapping.xlsx",
                                               document_mapping_sheet: str = "document_mapping",
                                               document_fund_count_sheet: str = "document_fund_count",
                                               document_data_file: str = "aus_document_prospectus.xlsx",
                                               document_data_sheet: str = "aus_document_prospectus"):
    document_mapping_file = os.path.join(data_folder, document_mapping_file)
    document_data_file = os.path.join(data_folder, document_data_file)

    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)
    document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet)

    document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet)
    document_data_df.fillna("", inplace=True)
    # get data from document_data_df which SecurityName is not empty string
    document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
    document_id_list = document_data_df["DocumentId"].unique().tolist()

    # get document which fund count > 1
    document_fund_count_df = document_fund_count_df[document_fund_count_df["DocumentId"].isin(document_id_list)]
    document_fund_count_df = document_fund_count_df[document_fund_count_df["DistinctFundCount"] > 1]
    document_fund_count_df = document_fund_count_df.sort_values(by="DistinctFundCount", ascending=False)
    # Calculate percentile
    percentiles = [0, 0.3, 0.6, 1]
    quantile_values = document_fund_count_df['DistinctFundCount'].quantile(percentiles)

    # Group by percentile
    bins = [quantile_values[0], quantile_values[0.3], quantile_values[0.6], quantile_values[1]]
    document_fund_count_df['Percentile_Group'] = pd.cut(document_fund_count_df['DistinctFundCount'], bins=bins, labels=["0-30", "30-60", "60-100"], include_lowest=True)

    # Get relevant samples based on percentile group
    percentile_result = pd.DataFrame()
    for group, count in zip(["0-30", "30-60", "60-100"], [30, 30, 40]):
        group_df = document_fund_count_df[document_fund_count_df['Percentile_Group'] == group]
        sampled_df = group_df.sample(n=min(len(group_df), count), random_state=42)
        percentile_result = pd.concat([percentile_result, sampled_df], ignore_index=True)
    percentile_result.reset_index(drop=True, inplace=True)
    document_id_list = percentile_result["DocumentId"].unique().tolist()
    final_document_mapping_df = document_mapping_df[document_mapping_df["DocumentId"].isin(document_id_list)]
    # order by DocumentId, FundLegalName, FundClassLegalName
    final_document_mapping_df = final_document_mapping_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
    final_document_mapping_df.reset_index(drop=True, inplace=True)

    # get CompanyId, CompanyName from final_document_mapping_df
    final_document_provider_df = final_document_mapping_df[["CompanyId", "CompanyName"]].drop_duplicates()
    # order by CompanyName
    final_document_provider_df = final_document_provider_df.sort_values(by="CompanyName", ascending=True)
    final_document_provider_df.reset_index(drop=True, inplace=True)

    final_document_data_df = document_data_df[document_data_df["DocumentId"].isin(document_id_list)]
    # order by DocumentId, FundLegalName, FundClassLegalName
    final_document_data_df = final_document_data_df.sort_values(by=["DocumentId", "FundLegalName", "FundClassLegalName"], ascending=True)
    final_document_data_df.reset_index(drop=True, inplace=True)

    output_file = os.path.join(data_folder, "aus_100_document_prospectus_multi_fund.xlsx")
    with pd.ExcelWriter(output_file) as writer:
        final_document_mapping_df.to_excel(
            writer, sheet_name="document_mapping", index=False
        )
        final_document_provider_df.to_excel(
            writer, sheet_name="document_provider", index=False
        )
        final_document_data_df.to_excel(
            writer, sheet_name="aus_document_data", index=False
        )
        percentile_result.to_excel(
            writer, sheet_name="percentile_result", index=False
        )
    output_sample_document_file = os.path.join(r"./sample_documents/",
                                               "aus_prospectus_100_documents_multi_fund_sample.txt")
    # output document id to txt file
    with open(output_sample_document_file, "w") as f:
        for doc_id in document_id_list:
            f.write(f"{doc_id}\n")

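# Note on the sampling in prepare_multi_fund_aus_prospectus_document: documents are
# bucketed by DistinctFundCount quantiles (0-30%, 30-60%, 60-100%), then up to
# 30/30/40 documents are drawn per bucket with a fixed random_state, so the sample
# of up to ~100 documents spans small, medium and fund-heavy prospectuses and is
# reproducible. (Hypothetical illustration: if the 0.3 and 0.6 quantiles of
# DistinctFundCount are 3 and 8, the buckets cover roughly 2-3, 4-8 and 9+ funds.)
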
def set_mapping_to_ravi_data():
    data_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx"
    data_sheet = "Sheet1"
    mapping_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
    mapping_sheet = "document_mapping"
    output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
    set_mapping_to_raw_name_data(data_file_path=data_file_path,
                                 data_sheet=data_sheet,
                                 mapping_file_path=mapping_file_path,
                                 mapping_sheet=mapping_sheet,
                                 output_file_path=output_file_path)

def set_mapping_to_data_side_documents_data():
    # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx"
    # data_sheet = "all"
    # mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
    # mapping_sheet = "document_mapping"
    # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx"

    # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
    # data_sheet = "ground_truth"
    # raw_name_column = "raw_share_name"
    # mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
    # mapping_sheet = "document_mapping"
    # raw_name_mapping_column = None
    # output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"

    data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth.xlsx"
    data_sheet = "ground_truth"
    raw_name_column = "raw_share_name"
    mapping_file_path = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx"
    mapping_sheet = "document_mapping"
    raw_name_mapping_column = None
    output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx"
    set_mapping_to_raw_name_data(data_file_path=data_file_path,
                                 data_sheet=data_sheet,
                                 raw_name_column=raw_name_column,
                                 mapping_file_path=mapping_file_path,
                                 mapping_sheet=mapping_sheet,
                                 raw_name_mapping_column=raw_name_mapping_column,
                                 output_file_path=output_file_path)

def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx",
                                 data_sheet: str = "Sheet1",
                                 raw_name_column: str = "raw_share_name",
                                 mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx",
                                 mapping_sheet: str = "document_mapping",
                                 raw_name_mapping_column: str = None,
                                 output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"):
    data_df = pd.read_excel(data_file_path, sheet_name=data_sheet)
    data_df["provider_id"] = ""
    data_df["provider_name"] = ""
    data_df["fund_id"] = ""
    data_df["fund_name"] = ""
    data_df["sec_id"] = ""
    data_df["sec_name"] = ""

    mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet)

    doc_id_list = data_df["doc_id"].unique().tolist()
    for doc_id in doc_id_list:
        doc_data = data_df[data_df["doc_id"] == doc_id]
        raw_name_list = doc_data[raw_name_column].unique().tolist()

        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue
        provider_id = doc_mapping_data["CompanyId"].values[0]
        provider_name = doc_mapping_data["CompanyName"].values[0]
        data_df.loc[(data_df["doc_id"] == doc_id), "provider_id"] = provider_id
        data_df.loc[(data_df["doc_id"] == doc_id), "provider_name"] = provider_name
        if raw_name_mapping_column is not None and raw_name_mapping_column == "FundLegalName":
            doc_db_name_list = doc_mapping_data[raw_name_mapping_column].unique().tolist()
            for raw_name in raw_name_list:
                find_df = doc_mapping_data[doc_mapping_data[raw_name_mapping_column] == raw_name]
                if find_df is not None and len(find_df) == 1:
                    sec_id = find_df["FundClassId"].values[0]
                    sec_name = find_df["FundClassLegalName"].values[0]
                    fund_id = find_df["FundId"].values[0]
                    fund_name = find_df["FundLegalName"].values[0]
                    # update the rows of data_df whose raw name equals this raw_name
                    data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_id"] = sec_id
                    data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "sec_name"] = sec_name
                    data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_id"] = fund_id
                    data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_name), "fund_name"] = fund_name
        else:
            doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist()
            all_match_result = get_raw_name_db_match_result(doc_id,
                                                            provider_name,
                                                            raw_name_list,
                                                            doc_db_name_list,
                                                            iter_count=60)
            for raw_share_name in raw_name_list:
                if all_match_result.get(raw_share_name) is not None:
                    matched_db_share_name = all_match_result[raw_share_name]
                    if (
                        matched_db_share_name is not None
                        and len(matched_db_share_name) > 0
                    ):
                        # look up the matched share class in the document mapping
                        find_share_df = doc_mapping_data[doc_mapping_data["FundClassLegalName"] == matched_db_share_name]
                        if find_share_df is not None and len(find_share_df) > 0:
                            sec_id = find_share_df["FundClassId"].values[0]
                            fund_id = find_share_df["FundId"].values[0]
                            fund_name = find_share_df["FundLegalName"].values[0]
                            # update the rows of data_df whose raw name equals this raw_share_name
                            data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_id"] = sec_id
                            data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "sec_name"] = matched_db_share_name
                            data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_id"] = fund_id
                            data_df.loc[(data_df["doc_id"] == doc_id) & (data_df[raw_name_column] == raw_share_name), "fund_name"] = fund_name
    try:
        data_df = data_df[["doc_id",
                           "provider_id",
                           "provider_name",
                           "raw_fund_name",
                           "fund_id",
                           "fund_name",
                           "raw_share_name",
                           "sec_id",
                           "sec_name",
                           "management_fee_and_costs",
                           "management_fee",
                           "administration_fees",
                           "minimum_initial_investment",
                           "benchmark_name",
                           "performance_fee_costs",
                           "buy_spread",
                           "sell_spread",
                           "total_annual_dollar_based_charges",
                           "interposed_vehicle_performance_fee_cost",
                           "establishment_fee",
                           "contribution_fee",
                           "withdrawal_fee",
                           "exit_fee",
                           "switching_fee",
                           "activity_fee"
                           ]]
    except Exception as e:
        logger.error(e)

    with open(output_file_path, "wb") as file:
        data_df.to_excel(file, index=False)

def get_raw_name_db_match_result(
    doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30
):
    # split raw_name_list into several parts, each with at most `iter_count` elements.
    # The reason to split is to avoid token limitation issues when invoking ChatGPT.
    raw_name_list_parts = [
        raw_name_list[i : i + iter_count]
        for i in range(0, len(raw_name_list), iter_count)
    ]
    all_match_result = {}
    doc_share_name_list = deepcopy(doc_share_name_list)
    for raw_name_list in raw_name_list_parts:
        match_result, doc_share_name_list = get_final_function_to_match(
            doc_id, provider_name, raw_name_list, doc_share_name_list
        )
        all_match_result.update(match_result)
    return all_match_result

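# A minimal usage sketch (hypothetical values): for a document whose extraction
# produced 75 raw share names, iter_count=60 sends the names to the matcher in two
# batches of 60 and 15; DB share-class names matched in the first batch are removed
# from the candidate list before the second batch, so the same DB name is not
# assigned twice across batches. The returned all_match_result maps each raw share
# name to its matched DB share-class name, or to an empty string when nothing matched.
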
def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list):
    if len(db_name_list) == 0:
        match_result = {}
        for raw_name in raw_name_list:
            match_result[raw_name] = ""
    else:
        match_result = final_function_to_match(
            doc_id=doc_id,
            pred_list=raw_name_list,
            db_list=db_name_list,
            provider_name=provider_name,
            doc_source="aus_prospectus"
        )
    matched_name_list = list(match_result.values())
    db_name_list = remove_matched_names(db_name_list, matched_name_list)
    return match_result, db_name_list

def remove_matched_names(target_name_list: list, matched_name_list: list):
    if len(matched_name_list) == 0:
        return target_name_list

    matched_name_list = list(set(matched_name_list))
    matched_name_list = [
        value for value in matched_name_list if value is not None and len(value) > 0
    ]
    for matched_name in matched_name_list:
        if (
            matched_name is not None
            and len(matched_name) > 0
            and matched_name in target_name_list
        ):
            target_name_list.remove(matched_name)
    return target_name_list

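# Example (illustrative):
#   remove_matched_names(["Class A", "Class B", "Class C"], ["Class B", "", None])
#   -> ["Class A", "Class C"]
# Empty and None matches are ignored; note the target list is modified in place
# as well as returned.
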
def adjust_data_file(source_file: str,
                     target_file: str):
    source_data = pd.read_excel(source_file, sheet_name="Sheet1")
    source_doc_id_list = source_data["DocumentId"].unique().tolist()

    target_data = pd.read_excel(target_file, sheet_name="Sheet1")
    # remove target_data rows whose DocumentId is in source_doc_id_list
    target_data = target_data[~target_data["DocumentId"].isin(source_doc_id_list)]
    # concat source_data and target_data
    target_data = pd.concat([source_data, target_data], ignore_index=True)
    with open(target_file, "wb") as file:
        target_data.to_excel(file, index=False)

def set_provider_to_ground_truth(ground_truth_file: str,
                                 ground_truth_sheet: str,
                                 document_mapping_file: str,
                                 document_mapping_sheet: str):
    ground_truth_df = pd.read_excel(ground_truth_file, sheet_name=ground_truth_sheet)
    ground_truth_df["provider_id"] = ""
    ground_truth_df["provider_name"] = ""

    mapping_data = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)

    doc_id_list = ground_truth_df["DocumentId"].unique().tolist()
    for doc_id in doc_id_list:
        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue
        provider_id = doc_mapping_data["CompanyId"].values[0]
        provider_name = doc_mapping_data["CompanyName"].values[0]
        ground_truth_df.loc[(ground_truth_df["DocumentId"] == doc_id), "provider_id"] = provider_id
        ground_truth_df.loc[(ground_truth_df["DocumentId"] == doc_id), "provider_name"] = provider_name
    try:
        ground_truth_df = ground_truth_df[["DocumentId",
                                           "provider_id",
                                           "provider_name",
                                           "raw_fund_name",
                                           "FundId",
                                           "FundLegalName",
                                           "raw_share_name",
                                           "FundClassId",
                                           "FundClassLegalName",
                                           "management_fee_and_costs",
                                           "management_fee",
                                           "administration_fees",
                                           "minimum_initial_investment",
                                           "benchmark_name",
                                           "performance_fee",
                                           "performance_fee_charged",
                                           "buy_spread",
                                           "sell_spread",
                                           "total_annual_dollar_based_charges",
                                           "interposed_vehicle_performance_fee_cost",
                                           "establishment_fee",
                                           "contribution_fee",
                                           "withdrawal_fee",
                                           "exit_fee",
                                           "switching_fee",
                                           "activity_fee",
                                           "hurdle_rate",
                                           "analyst_name"
                                           ]]
    except Exception as e:
        logger.error(e)

    with open(ground_truth_file, "wb") as file:
        ground_truth_df.to_excel(file, index=False)

def update_data_by_latest_ground_truth():
    # TODO: update current ground truth data by the latest version
    latest_ground_truth_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"

if __name__ == "__main__":
|
2025-03-24 22:10:16 +00:00
|
|
|
# update_data_by_latest_ground_truth()
|
2025-03-11 22:15:39 +00:00
|
|
|
# set_provider_to_ground_truth(
|
|
|
|
|
# groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
|
|
|
|
|
# ground_truth_sheet="Sheet1",
|
|
|
|
|
# document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx",
|
|
|
|
|
# document_mapping_sheet="document_mapping"
|
|
|
|
|
# )
|
2025-03-07 21:02:12 +00:00
|
|
|
|
2025-03-24 22:10:16 +00:00
|
|
|
set_mapping_to_data_side_documents_data()
|
2025-03-07 21:02:12 +00:00
|
|
|
|
|
|
|
|
# source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
|
|
|
|
|
# target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
|
|
|
|
|
# adjust_data_file(source_file=source_file, targe_file=target_file)
|
|
|
|
|
|
2025-01-17 22:26:31 +00:00
|
|
|
# pdf_exist()
|
2025-02-11 17:49:53 +00:00
|
|
|
# prepare_multi_fund_aus_prospectus_document()
|
2025-03-07 21:02:12 +00:00
|
|
|
# merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/",
|
|
|
|
|
# aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx",
|
|
|
|
|
# aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx",
|
|
|
|
|
# document_mapping_sheet="document_mapping",
|
|
|
|
|
# output_file="aus_prospectus_17_documents_data.xlsx",
|
|
|
|
|
# output_sheet="aus_document_prospectus")
|
2024-11-06 22:39:42 +00:00
|
|
|
folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
|
|
|
|
|
file_name = "doc_ar_data_for_emea_11_06.xlsx"
|
|
|
|
|
# get_document_with_all_4_data_points(folder, file_name, None)
|
2024-11-05 17:14:56 +00:00
|
|
|
# calc_typical_doc_metrics_v1()
|
2024-11-06 22:39:42 +00:00
|
|
|
# calc_typical_doc_metrics_v2()
|
2024-11-05 17:14:56 +00:00
|
|
|
|
2024-08-22 15:37:56 +00:00
|
|
|
doc_provider_file_path = (
|
|
|
|
|
r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
|
|
|
|
|
)
|
2024-12-11 22:49:04 +00:00
|
|
|
doc_ar_data_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
|
2024-08-22 15:37:56 +00:00
|
|
|
provider_mapping_data_file = (
|
|
|
|
|
r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx"
|
|
|
|
|
)
|
|
|
|
|
doc_mapping_from_top_100_provider_file = (
|
|
|
|
|
r"/data/emea_ar/basic_information/English/lux_english_ar_from_top_100_provider_since_2020.xlsx"
|
|
|
|
|
)
|
|
|
|
|
basic_info_folder = r"/data/emea_ar/basic_information/English/"
|
2024-08-19 20:49:45 +00:00
|
|
|
pdf_folder = r"/data/emea_ar/pdf/"
|
|
|
|
|
output_folder = r"/data/emea_ar/output/"
|
|
|
|
|
# get_unique_docids_from_doc_provider_data(doc_provider_file_path)
|
|
|
|
|
# download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
|
2024-09-18 22:10:54 +00:00
|
|
|
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
2024-08-23 21:38:11 +00:00
|
|
|
output_folder = r"/data/emea_ar/small_pdf_txt/"
|
|
|
|
|
random_small_document_data_file = (
|
|
|
|
|
r"/data/emea_ar/basic_information/English/lux_english_ar_top_100_provider_random_small_document.xlsx"
|
|
|
|
|
)
|
2024-09-18 22:10:54 +00:00
|
|
|
|
2024-10-28 20:15:55 +00:00
|
|
|
doc_provider_file_path = r"/data/emea_ar/basic_information/English/sample_doc/emea_final_typical_case/Final list of EMEA documents.xlsx"
|
2024-10-08 22:16:01 +00:00
|
|
|
pdf_folder = r"/data/emea_ar/pdf/"
|
2024-11-05 17:14:56 +00:00
|
|
|
# download_pdf(
|
|
|
|
|
# doc_provider_file_path=doc_provider_file_path,
|
|
|
|
|
# sheet_name="Sheet1",
|
|
|
|
|
# doc_id_column="Document Id",
|
|
|
|
|
# pdf_path=pdf_folder)
|
2024-12-18 15:19:55 +00:00
|
|
|
|
2025-01-07 22:25:13 +00:00
|
|
|
pdf_folder = r"/data/aus_prospectus/pdf/"
|
|
|
|
|
output_folder = r"/data/aus_prospectus/pdf_txt/"
|
2025-01-16 00:22:08 +00:00
|
|
|
# output_pdf_page_text(pdf_folder, output_folder)
|
2024-08-23 21:38:11 +00:00
|
|
|
|
2024-08-22 15:37:56 +00:00
|
|
|
# extract_pdf_table(pdf_folder, output_folder)
|
|
|
|
|
# analyze_json_error()
|
|
|
|
|
|
2024-09-18 22:10:54 +00:00
|
|
|
latest_top_100_provider_ar_data_file = r"/data/emea_ar/basic_information/English/top_100_provider_latest_document_most_mapping/lux_english_ar_from_top_100_provider_latest_document_with_most_mappings.xlsx"
|
|
|
|
|
# download_pdf(latest_top_100_provider_ar_data_file,
|
|
|
|
|
# 'latest_ar_document_most_mapping',
|
|
|
|
|
# pdf_folder)
|
2024-10-28 20:15:55 +00:00
|
|
|
|
|
|
|
|
|
2024-12-11 22:49:04 +00:00
|
|
|
doc_ar_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_ar_data_12_11.xlsx"
|
|
|
|
|
doc_mapping_data_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
|
|
|
|
output_data_folder = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/"
|
2024-12-18 15:19:55 +00:00
|
|
|
output_file="doc_ar_data_sample_documents_12_11_statistics.xlsx"
|
|
|
|
|
|
2025-01-17 22:26:31 +00:00
|
|
|
# pdf_folder = r"/data/aus_prospectus/pdf/"
|
|
|
|
|
# doc_ar_data_file_path = None
|
|
|
|
|
# doc_mapping_data_file_path = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
|
|
|
|
# output_data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
|
|
|
|
|
# output_file = "aus_100_document_prospectus_multi_fund_statistics.xlsx"
|
2024-12-18 15:19:55 +00:00
|
|
|
# statistics_document(pdf_folder=pdf_folder,
|
|
|
|
|
# doc_mapping_file_path=doc_mapping_data_file_path,
|
|
|
|
|
# doc_ar_data_file_path=doc_ar_data_file_path,
|
2025-01-17 22:26:31 +00:00
|
|
|
# mapping_sheet_name="document_mapping",
|
|
|
|
|
# ar_data_sheet_name="aus_document_data",
|
2024-12-18 15:19:55 +00:00
|
|
|
# output_folder=output_data_folder,
|
|
|
|
|
# output_file=output_file)
|
2025-01-17 22:26:31 +00:00
|
|
|
|
2024-10-08 22:16:01 +00:00
|
|
|
# get_document_extracted_share_diff_by_db()
|
2024-08-22 15:37:56 +00:00
|
|
|
# statistics_provider_mapping(
|
|
|
|
|
# provider_mapping_data_file=provider_mapping_data_file,
|
|
|
|
|
# output_folder=basic_info_folder,
|
|
|
|
|
# )
|
2024-08-23 21:38:11 +00:00
|
|
|
# statistics_document_fund_share_count(doc_mapping_from_top_100_provider_file)
|
2024-09-18 22:10:54 +00:00
|
|
|
# pickup_document_from_top_100_providers()
|
|
|
|
|
# compare_records_count_by_document_id()
|
2024-09-23 22:21:02 +00:00
|
|
|
|
2024-09-26 17:18:37 +00:00
|
|
|
# document_mapping_folder = r"/data/emea_ar/output/mapping/document/"
|
|
|
|
|
# all_data_file = r"/data/emea_ar/output/mapping/all_document_mapping.xlsx"
|
|
|
|
|
# concat_mapping(document_mapping_folder, all_data_file)
|
2024-09-23 22:21:02 +00:00
|
|
|
|
2024-09-26 17:18:37 +00:00
|
|
|
# provider_mapping_folder = r"/data/emea_ar/output/mapping/provider/"
|
|
|
|
|
# all_data_file = r"/data/emea_ar/output/mapping/all_provider_mapping.xlsx"
|
|
|
|
|
# concat_mapping(provider_mapping_folder, all_data_file)
|
2024-09-23 22:21:02 +00:00
|
|
|
|