Add a lot of functions to prepare data.

Blade He 2024-08-22 10:37:56 -05:00
parent f91e0cf1a8
commit 993664cf78
1 changed file with 592 additions and 23 deletions


@@ -1,6 +1,6 @@
import pandas as pd
import os
from tqdm import tqdm
import json
from glob import glob
import fitz
@@ -14,24 +14,37 @@ from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.pdf_util import PDFUtil
from pdf_table_extraction import PDFTableExtraction


def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
    doc_provider_data = pd.read_excel(doc_provider_file_path)
    # group by DocumentId and count the rows per document, producing a frame
    # with two columns: DocumentId and provider_count
    doc_provider_count = (
        doc_provider_data.groupby("DocumentId")
        .size()
        .reset_index(name="provider_count")
    )
    # sort by provider_count in descending order
    doc_provider_count = doc_provider_count.sort_values(
        by="provider_count", ascending=False
    )
    # save the raw data and the counts as two sheets of the same workbook
    with pd.ExcelWriter(doc_provider_file_path) as writer:
        doc_provider_data.to_excel(
            writer, sheet_name="doc_provider_details", index=False
        )
        doc_provider_count.to_excel(
            writer, sheet_name="doc_provider_count", index=False
        )
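

def _demo_provider_count_idiom():
    # Hedged sketch, not part of the original commit: a minimal illustration of
    # the groupby-size-reset_index idiom used above. The tiny frame here is a
    # hypothetical example, not project data.
    data = pd.DataFrame({"DocumentId": [1, 1, 2], "ProviderId": ["a", "b", "a"]})
    counts = data.groupby("DocumentId").size().reset_index(name="provider_count")
    # counts now has one row per DocumentId:
    #    DocumentId  provider_count
    # 0           1               2
    # 1           2               1
    return counts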


def download_pdf(doc_provider_file_path: str, sheet_name: str, pdf_path: str):
    document_data = pd.read_excel(doc_provider_file_path, sheet_name=sheet_name)
    # get all unique docids as a list of strings
    doc_id_list = [
        str(doc_id) for doc_id in document_data["DocumentId"].unique().tolist()
    ]
    # download pdfs
    logger.info(f"Start downloading {len(doc_id_list)} pdfs")
    os.makedirs(pdf_path, exist_ok=True)
@@ -48,14 +61,16 @@ def output_pdf_page_text(pdf_folder: str, output_folder: str):
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
        pdf_util = PDFUtil(pdf_file)
        success, text, page_text_dict = pdf_util.extract_text(
            output_folder=output_folder
        )
        if success:
            logger.info(f"Successfully extracted text from {pdf_file}")
@@ -68,8 +83,8 @@ def extract_pdf_table(pdf_folder: str, output_folder: str):
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    for pdf_file in pdf_files:
        logger.info(f"Start processing {pdf_file}")
@@ -79,9 +94,9 @@ def extract_pdf_table(pdf_folder: str, output_folder: str):
def analyze_json_error():
    text_file = r"/data/emea_ar/output/pdf_table_prompts/445877368_4.txt"
    with open(text_file, "r", encoding="utf-8") as file:
        text = file.read()
    # pull the fenced ```json ... ``` block out of the response text
    json_response = re.search(r"```json([\s\S]*)```", text)
    if json_response:
        json_text = json_response.group(1)
        json_data = {"tables": []}
@@ -89,20 +104,574 @@ def analyze_json_error():
            json_data = json.loads(json_text)
        except Exception:
            # fall back to json_repair when the response is not valid JSON
            json_data = json_repair.loads(json_text)
        table_list = json_data.get("tables", [])
        for table_num, table in enumerate(table_list):
            table_md_file = os.path.join("/temp/", f"temp_{table_num}.md")
            # collapse runs of newlines before writing the markdown table
            table = re.sub(r"(\n)+", "\n", table)
            with open(table_md_file, "w", encoding="utf-8") as file:
                file.write(table)
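

def _demo_json_repair_fallback():
    # Hedged sketch, not part of the original commit: mirrors the fallback in
    # analyze_json_error: strict json.loads first, then json_repair.loads for
    # near-JSON (e.g. a trailing comma) in an LLM response. The string below is
    # a made-up example.
    raw = '{"tables": ["| a | b |",]}'  # trailing comma: invalid strict JSON
    try:
        json_data = json.loads(raw)
    except Exception:
        json_data = json_repair.loads(raw)
    return json_data.get("tables", [])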


def statistics_document(
    pdf_folder: str, doc_mapping_file_path: str, output_folder: str
):
    if pdf_folder is None or len(pdf_folder) == 0 or not os.path.exists(pdf_folder):
        logger.error(f"Invalid pdf_folder: {pdf_folder}")
        return
    if (
        doc_mapping_file_path is None
        or len(doc_mapping_file_path) == 0
        or not os.path.exists(doc_mapping_file_path)
    ):
        logger.error(f"Invalid doc_mapping_file_path: {doc_mapping_file_path}")
        return
    if output_folder is None or len(output_folder) == 0:
        logger.error(f"Invalid output_folder: {output_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)
    describe_stat_df_list = []
    # document mapping statistics
    doc_mapping_data = pd.read_excel(doc_mapping_file_path, sheet_name="all_data")
    # count distinct FundIds per DocumentId
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on DocumentId"
    )
    doc_fund_id_df = doc_mapping_data[["DocumentId", "FundId"]].drop_duplicates()
    doc_fund_count = (
        doc_fund_id_df.groupby("DocumentId").size().reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    doc_fund_count = doc_fund_count.sort_values(by="fund_count", ascending=False)
    # describe fund_count and convert the summary to a one-row DataFrame
    doc_fund_count_stat_df = get_describe_stat(
        doc_fund_count, "fund_count", "doc_fund_count"
    )
    describe_stat_df_list.append(doc_fund_count_stat_df)
    # count distinct FundClassIds per DocumentId
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on DocumentId"
    )
    doc_share_class_id_df = doc_mapping_data[
        ["DocumentId", "FundClassId"]
    ].drop_duplicates()
    doc_share_class_count = (
        doc_share_class_id_df.groupby("DocumentId")
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    doc_share_class_count = doc_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    doc_share_class_count_stat_df = get_describe_stat(
        doc_share_class_count, "share_class_count", "doc_share_class_count"
    )
    describe_stat_df_list.append(doc_share_class_count_stat_df)
    # count distinct FundIds per ProviderCompanyId and CompanyName
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on ProviderCompanyId and CompanyName"
    )
    provider_fund_id_df = doc_mapping_data[
        ["ProviderCompanyId", "CompanyName", "FundId"]
    ].drop_duplicates()
    provider_fund_count = (
        provider_fund_id_df.groupby(["ProviderCompanyId", "CompanyName"])
        .size()
        .reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    provider_fund_count = provider_fund_count.sort_values(
        by="fund_count", ascending=False
    )
    # describe fund_count and convert the summary to a one-row DataFrame
    provider_fund_count_stat_df = get_describe_stat(
        provider_fund_count, "fund_count", "provider_fund_count"
    )
    describe_stat_df_list.append(provider_fund_count_stat_df)
    # count distinct FundClassIds per ProviderCompanyId and CompanyName
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on ProviderCompanyId"
    )
    provider_share_class_id_df = doc_mapping_data[
        ["ProviderCompanyId", "CompanyName", "FundClassId"]
    ].drop_duplicates()
    provider_share_class_count = (
        provider_share_class_id_df.groupby(["ProviderCompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    provider_share_class_count = provider_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    provider_share_class_count_stat_df = get_describe_stat(
        provider_share_class_count, "share_class_count", "provider_share_class_count"
    )
    describe_stat_df_list.append(provider_share_class_count_stat_df)
    # count distinct FundClassIds per FundId and FundLegalName
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on FundId and FundLegalName"
    )
    fund_share_class_id_df = doc_mapping_data[
        ["FundId", "FundLegalName", "FundClassId"]
    ].drop_duplicates()
    fund_share_class_count = (
        fund_share_class_id_df.groupby(["FundId", "FundLegalName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    fund_share_class_count = fund_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    fund_share_class_count_stat_df = get_describe_stat(
        fund_share_class_count, "share_class_count", "fund_share_class_count"
    )
    describe_stat_df_list.append(fund_share_class_count_stat_df)
    stat_file = os.path.join(output_folder, "doc_mapping_statistics_data.xlsx")
    # document page-number statistics
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    logger.info(f"Total {len(pdf_files)} pdf files found in {pdf_folder}")
    logger.info("statistics document page number")
    doc_page_num_list = []
    for pdf_file in tqdm(pdf_files):
        docid = os.path.basename(pdf_file).split(".")[0]
        doc = fitz.open(pdf_file)
        page_num = doc.page_count
        doc_page_num_list.append({"docid": docid, "page_num": page_num})
        doc.close()
    doc_page_num_df = pd.DataFrame(doc_page_num_list)
    # order by page_num in descending order
    doc_page_num_df = doc_page_num_df.sort_values(by="page_num", ascending=False)
    # describe page_num and convert the summary to a one-row DataFrame
    doc_page_num_stat_df = get_describe_stat(
        doc_page_num_df, "page_num", "doc_page_num"
    )
    describe_stat_df_list.append(doc_page_num_stat_df)
    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)
    # save all statistics to one workbook
    with pd.ExcelWriter(stat_file) as writer:
        doc_page_num_df.to_excel(writer, sheet_name="doc_page_num", index=False)
        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
        doc_share_class_count.to_excel(
            writer, sheet_name="doc_share_class_count", index=False
        )
        provider_fund_count.to_excel(
            writer, sheet_name="provider_fund_count", index=False
        )
        provider_share_class_count.to_excel(
            writer, sheet_name="provider_share_class_count", index=False
        )
        fund_share_class_count.to_excel(
            writer, sheet_name="fund_share_class_count", index=False
        )
        describe_stat_df.to_excel(
            writer, sheet_name="all_describe_statistics", index=False
        )


def statistics_provider_mapping(provider_mapping_data_file: str, output_folder: str):
    if (
        provider_mapping_data_file is None
        or len(provider_mapping_data_file) == 0
        or not os.path.exists(provider_mapping_data_file)
    ):
        logger.error(
            f"Invalid provider_mapping_data_file: {provider_mapping_data_file}"
        )
        return
    provider_mapping_data = pd.read_excel(provider_mapping_data_file)
    describe_stat_df_list = []
    # count distinct FundIds per CompanyId and CompanyName
    logger.info(
        "statistics provider_mapping_data for counting FundId count based on CompanyId and CompanyName"
    )
    provider_fund_id_df = provider_mapping_data[
        ["CompanyId", "CompanyName", "FundId"]
    ].drop_duplicates()
    provider_fund_count = (
        provider_fund_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="fund_count")
    )
    # order by fund_count in descending order
    provider_fund_count = provider_fund_count.sort_values(
        by="fund_count", ascending=False
    )
    # describe fund_count and convert the summary to a one-row DataFrame
    provider_fund_count_stat_df = get_describe_stat(
        provider_fund_count, "fund_count", "provider_fund_count"
    )
    describe_stat_df_list.append(provider_fund_count_stat_df)
    # fund_count sum over all companies
    all_companies_fund_count_sum = provider_fund_count["fund_count"].sum()
    top_n_company_fund_count_list = []
    # fund_count sum of the top 5 companies
    top_5_companies_fund_count, top_5_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 5, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 5,
            "fund_count": top_5_companies_fund_count,
            "percent": top_5_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 5 companies fund count sum: {top_5_companies_fund_count}")
    # fund_count sum of the top 10 companies
    top_10_companies_fund_count, top_10_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 10, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 10,
            "fund_count": top_10_companies_fund_count,
            "percent": top_10_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 10 companies fund count sum: {top_10_companies_fund_count}")
    # fund_count sum of the top 50 companies
    top_50_companies_fund_count, top_50_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 50, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 50,
            "fund_count": top_50_companies_fund_count,
            "percent": top_50_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 50 companies fund count sum: {top_50_companies_fund_count}")
    # fund_count sum of the top 100 companies
    top_100_companies_fund_count, top_100_companies_fund_count_percent = (
        get_top_n_records_count(
            provider_fund_count, "fund_count", 100, all_companies_fund_count_sum
        )
    )
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": 100,
            "fund_count": top_100_companies_fund_count,
            "percent": top_100_companies_fund_count_percent,
        }
    )
    logger.info(f"Top 100 companies fund count sum: {top_100_companies_fund_count}")
    # all providers as the final row
    top_n_company_fund_count_list.append(
        {
            "top_n_providers": len(provider_fund_count),
            "fund_count": all_companies_fund_count_sum,
            "percent": 100,
        }
    )
    top_n_company_fund_count_df = pd.DataFrame(top_n_company_fund_count_list)
    # count distinct SecIds per CompanyId and CompanyName
    logger.info(
        "statistics provider_mapping_data for counting SecId count based on CompanyId and CompanyName"
    )
    provider_share_class_id_df = provider_mapping_data[
        ["CompanyId", "CompanyName", "SecId"]
    ].drop_duplicates()
    provider_share_class_count = (
        provider_share_class_id_df.groupby(["CompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    provider_share_class_count = provider_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    provider_share_class_count_stat_df = get_describe_stat(
        provider_share_class_count, "share_class_count", "provider_share_class_count"
    )
    describe_stat_df_list.append(provider_share_class_count_stat_df)
    # share_class_count sum over all companies
    all_companies_share_class_count_sum = provider_share_class_count[
        "share_class_count"
    ].sum()
    top_n_company_share_class_count_list = []
    # share_class_count sum of the top 5 companies
    top_5_companies_share_class_count, top_5_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            5,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 5,
            "share_class_count": top_5_companies_share_class_count,
            "percent": top_5_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 5 companies share class count sum: {top_5_companies_share_class_count}"
    )
    # share_class_count sum of the top 10 companies
    top_10_companies_share_class_count, top_10_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            10,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 10,
            "share_class_count": top_10_companies_share_class_count,
            "percent": top_10_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 10 companies share class count sum: {top_10_companies_share_class_count}"
    )
    # share_class_count sum of the top 50 companies
    top_50_companies_share_class_count, top_50_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            50,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 50,
            "share_class_count": top_50_companies_share_class_count,
            "percent": top_50_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 50 companies share class count sum: {top_50_companies_share_class_count}"
    )
    # share_class_count sum of the top 100 companies
    top_100_companies_share_class_count, top_100_companies_share_class_count_percent = (
        get_top_n_records_count(
            provider_share_class_count,
            "share_class_count",
            100,
            all_companies_share_class_count_sum,
        )
    )
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": 100,
            "share_class_count": top_100_companies_share_class_count,
            "percent": top_100_companies_share_class_count_percent,
        }
    )
    logger.info(
        f"Top 100 companies share class count sum: {top_100_companies_share_class_count}"
    )
    # all providers as the final row
    top_n_company_share_class_count_list.append(
        {
            "top_n_providers": len(provider_share_class_count),
            "share_class_count": all_companies_share_class_count_sum,
            "percent": 100,
        }
    )
    top_n_company_share_class_count_df = pd.DataFrame(
        top_n_company_share_class_count_list
    )
    # count distinct SecIds per FundId and FundLegalName
    logger.info(
        "statistics provider_mapping_data for counting SecId count based on FundId and FundLegalName"
    )
    fund_share_class_id_df = provider_mapping_data[
        ["FundId", "FundLegalName", "SecId"]
    ].drop_duplicates()
    fund_share_class_count = (
        fund_share_class_id_df.groupby(["FundId", "FundLegalName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in descending order
    fund_share_class_count = fund_share_class_count.sort_values(
        by="share_class_count", ascending=False
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    fund_share_class_count_stat_df = get_describe_stat(
        fund_share_class_count, "share_class_count", "fund_share_class_count"
    )
    describe_stat_df_list.append(fund_share_class_count_stat_df)
    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)
    stat_file = os.path.join(output_folder, "provider_mapping_data_statistics.xlsx")
    # save all statistics to one workbook
    with pd.ExcelWriter(stat_file) as writer:
        top_n_company_fund_count_df.to_excel(
            writer, sheet_name="top_n_provider_fund_count", index=False
        )
        top_n_company_share_class_count_df.to_excel(
            writer, sheet_name="top_n_provider_share_count", index=False
        )
        provider_fund_count.to_excel(
            writer, sheet_name="provider_fund_count", index=False
        )
        provider_share_class_count.to_excel(
            writer, sheet_name="provider_share_count", index=False
        )
        fund_share_class_count.to_excel(
            writer, sheet_name="fund_share_count", index=False
        )
        describe_stat_df.to_excel(
            writer, sheet_name="all_describe_statistics", index=False
        )


def statistics_document_fund_share_count(provider_mapping_data_file: str):
    if (
        provider_mapping_data_file is None
        or len(provider_mapping_data_file) == 0
        or not os.path.exists(provider_mapping_data_file)
    ):
        logger.error(f"Invalid file_path: {provider_mapping_data_file}")
        return
    describe_stat_df_list = []
    # document mapping statistics
    doc_mapping_data = pd.read_excel(provider_mapping_data_file, sheet_name="all_data")
    # count distinct FundIds per DocumentId
    logger.info(
        "statistics doc_mapping_data for counting FundId count based on DocumentId"
    )
    doc_fund_id_df = doc_mapping_data[
        ["DocumentId", "CompanyId", "CompanyName", "FundId"]
    ].drop_duplicates()
    doc_fund_count = (
        doc_fund_id_df.groupby(["DocumentId", "CompanyId", "CompanyName"])
        .size()
        .reset_index(name="fund_count")
    )
    # order by fund_count in ascending order
    doc_fund_count = doc_fund_count.sort_values(by="fund_count", ascending=True)
    # describe fund_count and convert the summary to a one-row DataFrame
    doc_fund_count_stat_df = get_describe_stat(
        doc_fund_count, "fund_count", "doc_fund_count"
    )
    describe_stat_df_list.append(doc_fund_count_stat_df)
    # count distinct FundClassIds per DocumentId
    logger.info(
        "statistics doc_mapping_data for counting FundClassId count based on DocumentId"
    )
    doc_share_class_id_df = doc_mapping_data[
        ["DocumentId", "CompanyId", "CompanyName", "FundClassId"]
    ].drop_duplicates()
    doc_share_class_count = (
        doc_share_class_id_df.groupby(["DocumentId", "CompanyId", "CompanyName"])
        .size()
        .reset_index(name="share_class_count")
    )
    # order by share_class_count in ascending order
    doc_share_class_count = doc_share_class_count.sort_values(
        by="share_class_count", ascending=True
    )
    # describe share_class_count and convert the summary to a one-row DataFrame
    doc_share_class_count_stat_df = get_describe_stat(
        doc_share_class_count, "share_class_count", "doc_share_class_count"
    )
    describe_stat_df_list.append(doc_share_class_count_stat_df)
    describe_stat_df = pd.concat(describe_stat_df_list)
    describe_stat_df.reset_index(drop=True, inplace=True)
    # write all sheets back to the same workbook
    with pd.ExcelWriter(provider_mapping_data_file) as writer:
        doc_mapping_data.to_excel(writer, sheet_name="all_data", index=False)
        doc_fund_count.to_excel(writer, sheet_name="doc_fund_count", index=False)
        doc_share_class_count.to_excel(
            writer, sheet_name="doc_share_class_count", index=False
        )
        describe_stat_df.to_excel(
            writer, sheet_name="all_describe_statistics", index=False
        )


def get_top_n_records_count(
    df: pd.DataFrame, column_name: str, n: int, total_count: int
):
    # df is expected to be sorted descending by column_name
    top_n_records = df.head(n)
    top_n_records_count = top_n_records[column_name].sum()
    top_n_records_count_percent = round((top_n_records_count / total_count) * 100, 2)
    return top_n_records_count, top_n_records_count_percent
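

def _demo_top_n_records_count():
    # Hedged sketch, not part of the original commit: get_top_n_records_count
    # relies on the frame already being sorted descending, since head(n) simply
    # takes the first n rows. The numbers here are made up.
    df = pd.DataFrame({"fund_count": [20, 10, 15, 15]})
    df = df.sort_values(by="fund_count", ascending=False)  # 20, 15, 15, 10
    count, percent = get_top_n_records_count(
        df, "fund_count", 2, df["fund_count"].sum()
    )
    # count == 35, percent == round(35 / 60 * 100, 2) == 58.33
    return count, percent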


def get_describe_stat(df: pd.DataFrame, column_name: str, stat_type_name: str):
    # transpose describe() so the eight summary values become one row
    stat_df = df[column_name].describe().reset_index().T
    stat_df.columns = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    stat_df.reset_index(inplace=True)
    stat_df.rename(columns={"index": "Stat"}, inplace=True)
    # drop the first row (the transposed index labels)
    stat_df = stat_df[1:]
    if stat_type_name is not None:
        stat_df["Stat_Type"] = stat_type_name
        stat_df = stat_df[
            [
                "Stat_Type",
                "count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ]
        ]
    return stat_df
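

def _demo_get_describe_stat():
    # Hedged sketch, not part of the original commit: get_describe_stat turns
    # Series.describe() into a single labelled row, so the rows produced for
    # several metrics can be stacked with pd.concat into one comparison sheet.
    df = pd.DataFrame({"page_num": [2, 4, 6, 8]})
    stat_df = get_describe_stat(df, "page_num", "doc_page_num")
    # one row: Stat_Type=doc_page_num, count=4.0, mean=5.0, std~2.58,
    # min=2.0, 25%=3.5, 50%=5.0, 75%=6.5, max=8.0
    return stat_df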


if __name__ == "__main__":
    doc_provider_file_path = (
        r"/data/emea_ar/basic_information/English/latest_provider_ar_document.xlsx"
    )
    doc_mapping_file_path = r"/data/emea_ar/basic_information/English/latest_provider_ar_document_mapping.xlsx"
    provider_mapping_data_file = (
        r"/data/emea_ar/basic_information/English/provider_mapping_data.xlsx"
    )
    doc_mapping_from_top_100_provider_file = (
        r"/data/emea_ar/basic_information/English/lux_english_ar_from_top_100_provider_since_2020.xlsx"
    )
    basic_info_folder = r"/data/emea_ar/basic_information/English/"
    pdf_folder = r"/data/emea_ar/pdf/"
    output_folder = r"/data/emea_ar/output/"
    # get_unique_docids_from_doc_provider_data(doc_provider_file_path)
    # download_pdf(doc_provider_file_path, 'doc_provider_count', pdf_folder)
    # output_pdf_page_text(pdf_folder, output_folder)
    # extract_pdf_table(pdf_folder, output_folder)
    # analyze_json_error()
    # statistics_document(pdf_folder, doc_mapping_file_path, basic_info_folder)
    # statistics_provider_mapping(
    #     provider_mapping_data_file=provider_mapping_data_file,
    #     output_folder=basic_info_folder,
    # )
    statistics_document_fund_share_count(doc_mapping_from_top_100_provider_file)