optimize configuration
This commit is contained in:
parent
17284c74f0
commit
92a26cd262
|
|
@ -119,7 +119,8 @@
|
||||||
"Betriebskostenquote des Fonds",
|
"Betriebskostenquote des Fonds",
|
||||||
"TER",
|
"TER",
|
||||||
"Total Expense Ratio",
|
"Total Expense Ratio",
|
||||||
"Total Expense Ratios"
|
"Total Expense Ratios",
|
||||||
|
"Total Expense Ratio in Prozent"
|
||||||
],
|
],
|
||||||
"dutch": [
|
"dutch": [
|
||||||
"Totale-kostenpercentage",
|
"Totale-kostenpercentage",
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@
|
||||||
"reported_name": {
|
"reported_name": {
|
||||||
"tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.",
|
"tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.",
|
||||||
"ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.",
|
"ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.",
|
||||||
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), etc.",
|
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, etc.",
|
||||||
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
|
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
|
||||||
},
|
},
|
||||||
"data_business_features": {
|
"data_business_features": {
|
||||||
|
|
@ -56,7 +56,9 @@
|
||||||
"- With \"TER (en %) (with performance)\" and \"TER(en %) (without performance)\", pick up the values from \"TER (en %) (with performance)\".",
|
"- With \"TER (en %) (with performance)\" and \"TER(en %) (without performance)\", pick up the values from \"TER (en %) (with performance)\".",
|
||||||
"- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".",
|
"- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".",
|
||||||
"- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".",
|
"- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".",
|
||||||
"- With both of \"Net TER (including reimbursement)\" and \"Capped Expense Ratio\", the priority is \"Capped Expense Ratio\", please exclude the column: \"Net TER (including reimbursement)\", only pick up the values from \"Capped Expense Ratio\".",
|
"- With both of \"Net TER\" and \"Capped Expense Ratio\", the priority is \"Net TER\", please exclude the column: \"Capped Expense Ratio\", only pick up the values from \"Net TER\".",
|
||||||
|
"- With \"Gross TER\", \"Waiver\", \"Net TER\", \"Capped Expense Ratio\" as column titles, pick up the values from \"Net TER\".",
|
||||||
|
"- If \"Gross TER\" exists as a column title, please ignore it.",
|
||||||
"Please ignore TER values which with the exception of performance fees or excluded performance fees."
|
"Please ignore TER values which with the exception of performance fees or excluded performance fees."
|
||||||
],
|
],
|
||||||
"performance_fee": [
|
"performance_fee": [
|
||||||
|
|
|
||||||
46
main.py
46
main.py
|
|
@ -262,6 +262,11 @@ def batch_start_job(
|
||||||
):
|
):
|
||||||
pdf_files = glob(pdf_folder + "*.pdf")
|
pdf_files = glob(pdf_folder + "*.pdf")
|
||||||
doc_list = []
|
doc_list = []
|
||||||
|
for pdf_file in tqdm(pdf_files):
|
||||||
|
pdf_base_name = os.path.basename(pdf_file)
|
||||||
|
doc_id = pdf_base_name.split(".")[0]
|
||||||
|
doc_list.append(doc_id)
|
||||||
|
|
||||||
if special_doc_id_list is not None and len(special_doc_id_list) > 0:
|
if special_doc_id_list is not None and len(special_doc_id_list) > 0:
|
||||||
doc_list = special_doc_id_list
|
doc_list = special_doc_id_list
|
||||||
if (
|
if (
|
||||||
|
|
@ -276,11 +281,7 @@ def batch_start_job(
|
||||||
|
|
||||||
result_extract_data_list = []
|
result_extract_data_list = []
|
||||||
result_mapping_data_list = []
|
result_mapping_data_list = []
|
||||||
for pdf_file in tqdm(pdf_files):
|
for doc_id in tqdm(doc_list):
|
||||||
pdf_base_name = os.path.basename(pdf_file)
|
|
||||||
doc_id = pdf_base_name.split(".")[0]
|
|
||||||
if doc_list is not None and doc_id not in doc_list:
|
|
||||||
continue
|
|
||||||
doc_data_from_gpt, doc_mapping_data_list = mapping_data(
|
doc_data_from_gpt, doc_mapping_data_list = mapping_data(
|
||||||
doc_id=doc_id,
|
doc_id=doc_id,
|
||||||
pdf_folder=pdf_folder,
|
pdf_folder=pdf_folder,
|
||||||
|
|
@ -698,6 +699,7 @@ if __name__ == "__main__":
|
||||||
# "479793787",
|
# "479793787",
|
||||||
# "471641628",
|
# "471641628",
|
||||||
# ]
|
# ]
|
||||||
|
# English documents with ground truth
|
||||||
check_db_mapping_doc_id_list = [
|
check_db_mapping_doc_id_list = [
|
||||||
"292989214",
|
"292989214",
|
||||||
"316237292",
|
"316237292",
|
||||||
|
|
@ -747,6 +749,7 @@ if __name__ == "__main__":
|
||||||
"502693599"
|
"502693599"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Sample documents with special cases
|
||||||
# check_db_mapping_doc_id_list = [
|
# check_db_mapping_doc_id_list = [
|
||||||
# "334584772",
|
# "334584772",
|
||||||
# "406913630",
|
# "406913630",
|
||||||
|
|
@ -781,19 +784,42 @@ if __name__ == "__main__":
|
||||||
# "527525440",
|
# "527525440",
|
||||||
# "534535767"
|
# "534535767"
|
||||||
# ]
|
# ]
|
||||||
|
# Documents in EMEA Case 1.docx
|
||||||
|
check_db_mapping_doc_id_list = [
|
||||||
|
"435128656",
|
||||||
|
"425480144",
|
||||||
|
"466528487",
|
||||||
|
"434902020",
|
||||||
|
"440029306",
|
||||||
|
"431073795",
|
||||||
|
"430240853",
|
||||||
|
"427637151",
|
||||||
|
"434924914",
|
||||||
|
"467595142",
|
||||||
|
"466859621",
|
||||||
|
"429564034",
|
||||||
|
"424976833",
|
||||||
|
"466860852",
|
||||||
|
"466371135",
|
||||||
|
"470515549",
|
||||||
|
"434851173",
|
||||||
|
"434710819",
|
||||||
|
"429950833",
|
||||||
|
"467788879"
|
||||||
|
]
|
||||||
# special_doc_id_list = check_mapping_doc_id_list
|
# special_doc_id_list = check_mapping_doc_id_list
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["382366116"]
|
# special_doc_id_list = ["404712928"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
calculate_metrics = True
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
pdf_folder = r"/data/emea_ar/small_pdf/"
|
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||||
# pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
batch_start_job(
|
batch_start_job(
|
||||||
pdf_folder,
|
pdf_folder,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue