update to get more precise results

This commit is contained in:
Blade He 2024-09-12 16:00:49 -05:00
parent d56ac9482e
commit e17414173a
6 changed files with 117 additions and 38 deletions

View File

@ -256,9 +256,15 @@ class DataExtraction:
share_name = data.get("share name", "") share_name = data.get("share name", "")
if share_name != "": if share_name != "":
new_data["share_name"] = share_name new_data["share_name"] = share_name
ter = data.get("ter", None)
if ter is not None:
new_data["ter"] = ter
performance_fee = data.get("performance fees", None)
if performance_fee is not None:
new_data["performance_fee"] = performance_fee
for key, value in data.items(): for key, value in data.items():
if key not in ["fund name", "share name"]: if key not in ["fund name", "share name", "ter", "performance fees"]:
new_data[key] = value new_data[key] = value
new_data_list.append(new_data) new_data_list.append(new_data)

View File

@ -1,18 +1,12 @@
import os import os
import json import json
import json_repair
import re
import fitz
import pandas as pd import pandas as pd
from utils.gpt_utils import chat
from utils.pdf_util import PDFUtil
from utils.biz_utils import get_most_similar_name from utils.biz_utils import get_most_similar_name
from utils.sql_query_util import ( from utils.sql_query_util import (
query_document_fund_mapping, query_document_fund_mapping,
query_investment_by_provider, query_investment_by_provider,
) )
from utils.logger import logger from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex, clean_text
class DataMapping: class DataMapping:
@ -132,13 +126,13 @@ class DataMapping:
"doc_id": doc_id, "doc_id": doc_id,
"page_index": page_index, "page_index": page_index,
"raw_name": integrated_share_name, "raw_name": integrated_share_name,
"datapoint": datapoint,
"value": raw_data[datapoint],
"investment_type": 1,
"investment_id": "", "investment_id": "",
"investment_name": "", "investment_name": "",
"investment_type": 1,
"similarity": 0 "similarity": 0
} }
mapped_data["datapoint"] = datapoint
mapped_data["value"] = raw_data[datapoint]
mapped_data_list.append(mapped_data) mapped_data_list.append(mapped_data)
else: else:
raw_data_keys = list(raw_data.keys()) raw_data_keys = list(raw_data.keys())
@ -148,13 +142,12 @@ class DataMapping:
"doc_id": doc_id, "doc_id": doc_id,
"page_index": page_index, "page_index": page_index,
"raw_name": raw_fund_name, "raw_name": raw_fund_name,
"investment_id": "", "datapoint": datapoint,
"investment_name": "", "value": raw_data[datapoint],
"investment_type": 33, "investment_type": 33,
"similarity": 0 "investment_id": "",
"investment_name": ""
} }
mapped_data["datapoint"] = datapoint
mapped_data["value"] = raw_data[datapoint]
mapped_data_list.append(mapped_data) mapped_data_list.append(mapped_data)
else: else:
raw_name = "" raw_name = ""
@ -201,13 +194,13 @@ class DataMapping:
"doc_id": doc_id, "doc_id": doc_id,
"page_index": page_index, "page_index": page_index,
"raw_name": raw_name, "raw_name": raw_name,
"datapoint": datapoint,
"value": raw_data[datapoint],
"investment_type": investment_info["investment_type"],
"investment_id": investment_info["id"], "investment_id": investment_info["id"],
"investment_name": investment_info["legal_name"], "investment_name": investment_info["legal_name"],
"investment_type": investment_info["investment_type"],
"similarity": investment_info["similarity"] "similarity": investment_info["similarity"]
} }
mapped_data["datapoint"] = datapoint
mapped_data["value"] = raw_data[datapoint]
mapped_data_list.append(mapped_data) mapped_data_list.append(mapped_data)
json_data_file = os.path.join( json_data_file = os.path.join(

View File

@ -3,8 +3,7 @@ Context:
Instructions: Instructions:
Read the context carefully. Read the context carefully.
Maybe exists TOR, TER, performance fees, OGC data in the context. Maybe there are TOR, TER, performance fees, OGC data in the context.
The TOR reported name could be: The TOR reported name could be:
TOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc. TOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.
@ -50,9 +49,12 @@ Here is the example:
GAMAX FUNDS FCP\nClass\nTER (excluding Performance Fees)\nTER (including Performance Fees)\nGAMAX FUNDS - ASIA PACIFIC\nA\n2.07%\n2.07%\n GAMAX FUNDS FCP\nClass\nTER (excluding Performance Fees)\nTER (including Performance Fees)\nGAMAX FUNDS - ASIA PACIFIC\nA\n2.07%\n2.07%\n
The output should be: The output should be:
{
"data":
[ [
{"fund name": "GAMAX FUNDS - ASIA PACIFIC", "share data": ["share name": "A", "ter": 2.07, "performance fees": 0]} {"fund name": "GAMAX FUNDS - ASIA PACIFIC", "share name": "A", "ter": 2.07, "performance fees": 0}
] ]
}
The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0 The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0
2. Combo TER value table. 2. Combo TER value table.
@ -66,10 +68,13 @@ Here is the example:
Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution Balanced Class X\n0.1475%\n0.7025%\n0.850%\n Feeder fund (share class)\nMaster fund\nTER\nFeeder\nTER Master\nTotal\nGlobal Portfolio Solution DKK -\nBalanced Class TI\nDanske Invest SICAV Global Portfolio\nSolution Balanced Class X\n0.1475%\n0.7025%\n0.850%\n
The output should be: The output should be:
{
"data":
[ [
{"fund name": "Global Portfolio Solution DKK", "share data": ["share name": "Balanced Class TI", "ter": 0.1475]}, {"fund name": "Global Portfolio Solution DKK", "share name": "Balanced Class TI", "ter": 0.1475},
{"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share data": ["share name": "Balanced Class X", "ter": 0.7025]}, {"fund name": "Danske Invest SICAV Global Portfolio Solution DKK", "share name": "Balanced Class X", "ter": 0.7025}
] ]
}
3. Latest data with time series data 3. Latest data with time series data
Some data tables have multiple date columns; please extract the data from the latest date column: Some data tables have multiple date columns; please extract the data from the latest date column:
@ -79,9 +84,12 @@ The latest date-time column usually is the first "TOR, TER, performance fees, OG
Here is the example: Here is the example:
PERFORMANCE\nHISTORICAL PERFORMANCE\nHISTORICAL PERFORMANCE\nFrom \n1 July \nFrom \n19 July \nFrom \n1 January \nFrom \n27 April \nFrom \n19 July \nFrom \n1 January \n2021\nFrom \n22 May \n2021\nFrom \n16 July \n2021\nFrom \n21 September \n2021\nto 30 June 2023\nto 31 December 2022\nto 31 December 2021\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\n6.73%\n \n-13.32%\n \n \n 6.04%\n \n \n \n PERFORMANCE\nHISTORICAL PERFORMANCE\nHISTORICAL PERFORMANCE\nFrom \n1 July \nFrom \n19 July \nFrom \n1 January \nFrom \n27 April \nFrom \n19 July \nFrom \n1 January \n2021\nFrom \n22 May \n2021\nFrom \n16 July \n2021\nFrom \n21 September \n2021\nto 30 June 2023\nto 31 December 2022\nto 31 December 2021\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\n6.73%\n \n-13.32%\n \n \n 6.04%\n \n \n \n
The output should be: The output should be:
{
"data":
[ [
{"fund name": "Asia Total Return Fund", "share data": ["share name": "Class I5 (CHF Hedged) Acc", "performance fees": 6.73]}, {"fund name": "Asia Total Return Fund", "share name": "Class I5 (CHF Hedged) Acc", "performance fees": 6.73}
] ]
}
The keyword for performance fees is PERFORMANCE, the value 6.73 is the first number with the latest date-time. The keyword for performance fees is PERFORMANCE, the value 6.73 is the first number with the latest date-time.
4. TER reported name priority 4. TER reported name priority
@ -89,10 +97,12 @@ If exists both of Expense Ratio and Synthetic total Expense Ratio, please extrac
Output requirement: Output requirement:
1. If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output. 1. If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.
2. The required output items are "fund name" and "share name". 2. If a share name is found and a relevant currency exists, please output share name + currency, e.g. if the share name is "Class A" and the currency is "USD", the output share name should be: "Class A USD".
3. Only output the data points that have a relevant value.
4. fund level data: ("fund name" and "TOR") and share level data: ("fund name", "share name", "ter", "performance fees", "ogc") should be output separately. 4. fund level data: ("fund name" and "TOR") and share level data: ("fund name", "share name", "ter", "performance fees", "ogc") should be output separately.
5. The output should be JSON format, the format is like: 5. The output should be JSON format, the format is like:
{
"data":
[{ [{
"fund name": "fund 1", "fund name": "fund 1",
"TOR": 35.26 "TOR": 35.26
@ -107,19 +117,45 @@ Output requirement:
}, },
{ {
"fund name": "fund 1", "fund name": "fund 1",
"share data": [{"share name": "share 1", "ter": 1.23, "performance fees": 0.2, "ogc": 0.05},{"share name": "share 2", "ter": 2.56, "performance fees": 1.2, "ogc": 1.16}] "share name": "share 1",
"ter": 1.23,
"performance fees": 0.2,
"ogc": 0.05
},
{
"fund name": "fund 1",
"share name": "share 2",
"ter": 2.56,
"performance fees": 1.2,
"ogc": 1.16
}, },
{ {
"fund name": "fund 2", "fund name": "fund 2",
"share data": [{"share name": "share a", "ter": 1.16, "performance fees": -0.15},{"share name": "share b", "ter": 1.45}] "share name": "share a",
"ter": 1.16,
"performance fees": -0.15
},
{
"fund name": "fund 2",
"share name": "share b",
"ter": 1.45
}, },
{ {
"fund name": "fund 3", "fund name": "fund 3",
"share data": [{"share name": "share a", "performance fees": 0.57, "ogc": 0.18},{"share name": "share b", "performance fees": -0.11}] "share name": "share a",
}] "performance fees": 0.57,
"ogc": 0.18
},
{
"fund name": "fund 3",
"share name": "share b",
"performance fees": -0.11
}
]
}
Only output JSON data. Only output JSON data.
Don't output values that do not exist in the context, especially for the fund level datapoint: TOR. Don't output values that do not exist in the context, especially for the fund level datapoint: TOR.
If you can't find a share class name in the context, please output empty JSON data: [] If you can't find a share class name in the context, please output empty JSON data: {"data": []}
Answer: Answer:

View File

@ -3,7 +3,7 @@
"reported_name": { "reported_name": {
"tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.", "tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, etc.",
"ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.", "ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.",
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, etc.", "ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), etc.",
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc." "performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
}, },
"data_business_features": { "data_business_features": {
@ -30,6 +30,7 @@
"ter": [ "ter": [
"If there are multiple TER value columns, here is the priority rules:", "If there are multiple TER value columns, here is the priority rules:",
"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".", "- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",
"- With \"TER (en %) (with performance)\" and \"TER(en %) (without performance)\", pick up the values from \"TER (en %) (with performance)\".",
"- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".", "- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".",
"- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".", "- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".",
"- With both of \"Net TER (including reimbursement)\" and \"Capped Expense Ratio\", the priority is \"Capped Expense Ratio\", please exclude the column: \"Net TER (including reimbursement)\", only pick up the values from \"Capped Expense Ratio\".", "- With both of \"Net TER (including reimbursement)\" and \"Capped Expense Ratio\", the priority is \"Capped Expense Ratio\", please exclude the column: \"Net TER (including reimbursement)\", only pick up the values from \"Capped Expense Ratio\".",
@ -49,7 +50,7 @@
"Here is the example:", "Here is the example:",
"performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom \\n1 July \\nFrom \\n19 July \\nFrom \\n1 January \\nFrom \\n27 April \\nFrom \\n19 July \\nFrom \\n1 January \\n2021\\nFrom \\n22 May \\n2021\\nFrom \\n16 July \\n2021\\nFrom \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n", "performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom \\n1 July \\nFrom \\n19 July \\nFrom \\n1 January \\nFrom \\n27 April \\nFrom \\n19 July \\nFrom \\n1 January \\n2021\\nFrom \\n22 May \\n2021\\nFrom \\n16 July \\n2021\\nFrom \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance fees\": 1.73}]}", "{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}",
"The keywords are performance fees, the value 1.73 is the first number with the latest date-time." "The keywords are performance fees, the value 1.73 is the first number with the latest date-time."
] ]
} }
@ -88,7 +89,7 @@
"Here is the example:", "Here is the example:",
"GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n", "GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance fees\": 0}]}", "{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}",
"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0" "The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0"
] ]
} }
@ -98,8 +99,11 @@
"common": [ "common": [
"If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.", "If possible, please extract fund name, share name, TOR, TER, performance fees, OGC values as the output.",
"If find share name, and exist relevant currency, please output share name + currency, e.g. share name is \"Class A\", currency is \"USD\", the output share name should be: \"Class A USD\".", "If find share name, and exist relevant currency, please output share name + currency, e.g. share name is \"Class A\", currency is \"USD\", the output share name should be: \"Class A USD\".",
"Only output the dasta point which with relevant value.", "Only output the data point which with relevant value.",
"fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Ignore the data point which with -, N/A, N/A%, N/A %, NONE, etc.",
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
"The output should be JSON format, the format is like below example(s):" "The output should be JSON format, the format is like below example(s):"
], ],
"fund_level": [ "fund_level": [
@ -127,7 +131,7 @@
1.16 1.16
], ],
"performance_fee_value": [ "performance_fee_value": [
0.2, 0,
-0.15, -0.15,
0.11 0.11
] ]

View File

@ -493,7 +493,7 @@ if __name__ == "__main__":
output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/" output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/" output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
re_run_extract_data = False re_run_extract_data = True
# batch_extract_data( # batch_extract_data(
# pdf_folder, # pdf_folder,
# page_filter_ground_truth_file, # page_filter_ground_truth_file,
@ -505,7 +505,7 @@ if __name__ == "__main__":
# doc_id = "476492237" # doc_id = "476492237"
# extract_data(doc_id, pdf_folder, output_extract_data_child_folder, re_run) # extract_data(doc_id, pdf_folder, output_extract_data_child_folder, re_run)
special_doc_id_list = [] special_doc_id_list = ["458291624"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_mapping_data = True re_run_mapping_data = True

View File

@ -200,6 +200,43 @@ def replace_abbrevation(text: str):
if text is None or len(text.strip()) == 0: if text is None or len(text.strip()) == 0:
return text return text
text = text.strip() text = text.strip()
if 'swiss franc' in text.lower():
text = re.sub(r'swiss\s+franc', 'CHF', text, flags=re.IGNORECASE)
elif 'us dollar' in text.lower():
text = re.sub(r'us\s+dollar', 'USD', text, flags=re.IGNORECASE)
elif 'singapore dollar' in text.lower():
text = re.sub(r'singapore\s+dollar', 'SGD', text, flags=re.IGNORECASE)
elif 'hong kong dollar' in text.lower():
text = re.sub(r'hong\s+kong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
elif 'hongkong dollar' in text.lower():
text = re.sub(r'hongkong\s+dollar', 'HKD', text, flags=re.IGNORECASE)
elif 'australian dollar' in text.lower():
text = re.sub(r'australian\s+dollar', 'AUD', text, flags=re.IGNORECASE)
elif 'japanese yen' in text.lower():
text = re.sub(r'japanese\s+yen', 'JPY', text, flags=re.IGNORECASE)
elif 'south african rand' in text.lower():
text = re.sub(r'South\s+African\s+rand', 'ZAR', text, flags=re.IGNORECASE)
elif 'canadian dollar' in text.lower():
text = re.sub(r'canadian\s+dollar', 'CAD', text, flags=re.IGNORECASE)
elif 'new zealand dollar' in text.lower():
text = re.sub(r'new\s+zealand\s+dollar', 'NZD', text, flags=re.IGNORECASE)
elif 'norwegian krone' in text.lower():
text = re.sub(r'norwegian\s+krone', 'NOK', text, flags=re.IGNORECASE)
elif 'danish krone' in text.lower():
text = re.sub(r'danish\s+krone', 'DKK', text, flags=re.IGNORECASE)
elif 'swedish krona' in text.lower():
text = re.sub(r'swedish\s+krona', 'SEK', text, flags=re.IGNORECASE)
elif 'swedish kronor' in text.lower():
text = re.sub(r'swedish\s+kronor', 'SEK', text, flags=re.IGNORECASE)
elif 'sterling' in text.lower().split():
text = re.sub(r'sterling', 'GBP', text, flags=re.IGNORECASE)
elif 'euro' in text.lower().split():
text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
elif '' in text.lower().split():
text = re.sub(r'\', 'EUR', text, flags=re.IGNORECASE)
else:
pass
text_splits = text.split() text_splits = text.split()
new_text_splits = [] new_text_splits = []
for split in text_splits: for split in text_splits:
@ -213,6 +250,8 @@ def replace_abbrevation(text: str):
new_text_splits.append('Investor') new_text_splits.append('Investor')
elif split.lower() in ['inst', 'inst', 'institution']: elif split.lower() in ['inst', 'inst', 'institution']:
new_text_splits.append('Institutional') new_text_splits.append('Institutional')
elif split.lower() in ['cap']:
new_text_splits.append('Capitalisation')
elif split.lower() in ['adm']: elif split.lower() in ['adm']:
new_text_splits.append('Admin') new_text_splits.append('Admin')
elif split.lower() in ['adv']: elif split.lower() in ['adv']:
@ -229,5 +268,6 @@ def replace_abbrevation(text: str):
new_text_splits.append('no trail') new_text_splits.append('no trail')
else: else:
new_text_splits.append(split) new_text_splits.append(split)
new_text = ' '.join(new_text_splits) new_text = ' '.join(new_text_splits)
return new_text return new_text