# dc-ml-emea-ar/core/auz_nz/hybrid_solution_script.py

import pandas as pd
import os
import json
import json_repair
import pandas as pd
import math
import ast
from .string_similarity import get_cosine_similarity, get_jaccard_similarity, get_levenshtien_distance_score
import nltk
from nltk.corpus import stopwords
# from dotenv import load_dotenv
from collections import Counter
import re
from utils.gpt_utils import chat
from utils.logger import logger
# gpt_call = GPTAPI()
# Download the stopwords list if not already downloaded
nltk.download('stopwords')
import json
from openai import AzureOpenAI
# load_dotenv()
# API_KEY = os.getenv("API_KEY")
# MODEL = os.getenv("MODEL")
# END_POINT = os.getenv("END_POINT")
# API_VERSION = os.getenv("API_VERSION")
### STEP 1 - Abbreviation Replacement
ABB_JSON = dict()  # Lazy-loaded abbreviation -> expansion mapping (lower-cased keys).
def get_abb_json(doc_source: str = "aus_prospectus"):
    """Lazily load the abbreviation mapping for *doc_source* into ABB_JSON.

    The JSON file is read once per process and its keys are lower-cased so
    lookups can be case-insensitive. Subsequent calls are no-ops.

    NOTE(review): the cache is a single global — if this is later called with
    a different doc_source, the first mapping is silently reused. Confirm
    that only one doc_source is used per process.
    """
    global ABB_JSON
    if not ABB_JSON:
        path = f"./configuration/{doc_source}/abbreviation_records.json"
        with open(path, "r", encoding="utf-8") as file:
            # Load the JSON and convert keys to lowercase.
            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
    """Expand known abbreviations inside a single fund name.

    Special characters become spaces, the name is lower-cased, and each
    token is swapped for its expansion when present in the (lazily loaded)
    ABB_JSON mapping. Returns the space-joined, lower-cased result.
    """
    # Ensure the abbreviation mapping is loaded before any lookups.
    get_abb_json(doc_source)
    tokens = re.sub(r'[^a-zA-Z0-9\s]', ' ', fundname).lower().split()
    expanded = [ABB_JSON.get(token, token).lower() for token in tokens]
    return " ".join(expanded)
def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"):
    """Apply abbreviation expansion to every fund name in the list."""
    expanded = []
    for name in fund_names_list:
        expanded.append(get_abbre_format_str(name, doc_source))
    return expanded
### STEP 2 - Remove Stopwords
def remove_stopwords_nltk(fund_names):
    """Strip English stopwords and fund-specific filler words from each name.

    Tokenises on non-word characters, drops stopwords and empty tokens, and
    returns each cleaned name in title case.
    """
    # Fund-related filler words, removed in addition to NLTK's English list.
    all_stopwords = set(stopwords.words('english')) | {'inc', 'fund', 'lp', 'llc', 'plc'}

    def clean(name):
        tokens = re.split(r'\W+', name.lower())
        kept = [tok for tok in tokens if tok not in all_stopwords and tok.strip() != '']
        return ' '.join(kept).title()

    return [clean(name) for name in fund_names]
### STEP 3 - Special characters removal
def remove_special_characters(fund_group):
fund_names = [re.sub(r'[^a-zA-Z0-9\s]', ' ', txt_fund).strip() for txt_fund in fund_group]
return fund_names
### STEP 4 - Common words removal
def remove_common_words(fund_list, common_words=None):
    """Drop words shared by >=70% of the fund names (or a supplied word list).

    Returns (filtered_names, common_words). A name that would become empty
    keeps its original text. With two or fewer names and no explicit
    common_words, the input is returned untouched (common_words stays None).
    """
    if len(fund_list) > 2 or common_words:
        if not common_words:
            # Count word frequency across all names; words appearing in at
            # least 70% of them are treated as provider boilerplate.
            counts = Counter(word for fund in fund_list for word in fund.split())
            cutoff = 0.7 * len(fund_list)
            common_words = list({word for word, n in counts.items() if n >= cutoff})
        else:
            common_words = list(common_words)
        filtered = []
        for fund in fund_list:
            remaining = ' '.join(w for w in fund.split() if w not in common_words)
            # Keep the original name rather than returning an empty string.
            filtered.append(fund if remaining.strip() == '' else remaining)
        return filtered, common_words
    return fund_list, common_words
### STEP 5 - LLM with Provider
# Prompt template for the LLM fallback matcher. The text below is sent to the
# model verbatim (with provider/pred/db lists appended) — do not edit casually,
# as wording changes alter matching behaviour.
prompt_instruction = """
### Task Overview:
You will be given data in the form of `provider_name` (string), `prediction_fund` (list of strings), and `true_fund` (list of strings). Your task is to match each fund from the `prediction_fund` list to the correct entry in the `true_fund` list. The final output should be a JSON where the keys are funds from `prediction_fund` and the values are the matching funds from `true_fund` or an empty string `""` if no match is found.
### Instructions:
1. Provider Name Handling:
If the same word (like the provider name) appears across multiple `true_fund` entries, it is likely part of the provider's name. In this case, ignore such common words while performing the matching.
Example:
- Input:
`provider_name`: 'Betashares'
`prediction_fund`:
[
"AUS 200",
"AUS CREDIT",
"AUS SUSTAINABLE",
"GLOBAL QUALITY",
"GLOBAL SUSTAINABLE"
]
`true_fund`:
[
"Betashares Australian Sustainability Leaders Fund",
"Betashares Australia 200 Fund",
"Betashares Global Quality Leaders Fund",
"Betashares Australian Investment Grade Corporate Bond Fund",
"Betashares Global Sustainability Leaders Fund"
]
- Output:
```json
{
"AUS 200": "Betashares Australia 200 Fund",
"AUS CREDIT": "",
"AUS SUSTAINABLE": "Betashares Australian Sustainability Leaders Fund",
"GLOBAL QUALITY": "Betashares Global Quality Leaders Fund",
"GLOBAL SUSTAINABLE": "Betashares Global Sustainability Leaders Fund"
}
```
2. Abbreviation Handling:
Some `prediction_fund` entries may use abbreviations or short forms (e.g., "AUS" for "Australia"). Identify and handle these cases by using context from both `prediction_fund` and `true_fund` lists, as shown in the example above. Match abbreviations to the expanded terms where applicable.
3. No Match Cases:
If you cannot find a suitable match for a fund from `prediction_fund` in `true_fund`, leave the match blank by assigning an empty string `""` to that entry. If you are unsure about the correct match, do not make incorrect assumptions leave it blank.
4. Duplicate Mapping Prevention:
Ensure that each true_fund name maps to only one entry in prediction_fund to avoid duplicate mappings. If multiple prediction_fund names appear to match the same true_fund name, perform a detailed word-by-word analysis to determine the closest match based on content and context. Only map one prediction_fund name to each true_fund name, and if no strong match is found, leave it blank (""). Avoid making assumptions if clarity is lacking.
### Example Input and Output:
- Sample 1:
- Input:
`provider_name`: 'ANZ'
`prediction_fund`:
[
"Cash Fund",
"Conservative Fund",
"Conservative Balanced Fund",
"Balanced Fund"
]
`true_fund`:
[
"ANZ KiwiSaver High Growth Fund",
"ANZ KiwiSaver Conservative",
"ANZ KiwiSaver Conservative Balanced",
"ANZ KiwiSaver Balanced Growth",
"ANZ KiwiSaver Growth",
"ANZ KiwiSaver Cash"
]
- Output:
```json
{
"Cash Fund": "ANZ KiwiSaver Cash",
"Conservative Fund": "ANZ KiwiSaver Conservative",
"Conservative Balanced Fund": "ANZ KiwiSaver Conservative Balanced",
"Balanced Fund": ""
}
```
- Sample 2:
- Input:
`provider_name`: 'iShare'
`prediction_fund`:
[
"iShares Wholesale Screened International Equity Index Fund (Class E Units)",
"iShares Wholesale Australian Bond Index Fund (Class E Units)",
"iShares ESG Australian Bond Index Fund (Class E Units)",
"iShares Wholesale Australian Equity Index Fund (Class E Units)",
"iShares Wholesale Australian Listed Property Index Fund (Class E Units)",
"iShares Global Listed Property Index Fund (Hedged Class E Units)",
"iShares Wholesale International Equity Index Fund (Class E Units)",
"iShares Hedged International Equity Index Fund (Class E Units)",
"iShares ESG Global Bond Index Fund (Class E Units)",
"iShares Global Bond Index Fund (Class E Units)"
]
`true_fund`:
[
"iShares Wholesale Indexed Australian Bond Fund",
"iShares Global Bond Index Fund",
"iShares Australian Listed Property Index Fund",
"iShares Emerging Markets IMI Equity Index Fund",
"iShares International Equity Index (Hgd)",
"iShares Wholesale Australian Equity Index Fund",
"iShares Screened Wholesale International Equity Index Fund"
]
- Output:
```json
{
"iShares Wholesale Screened International Equity Index Fund (Class E Units)": "iShares Screened Wholesale International Equity Index Fund",
"iShares Wholesale Australian Bond Index Fund (Class E Units)": "iShares Wholesale Indexed Australian Bond Fund",
"iShares ESG Australian Bond Index Fund (Class E Units)": "",
"iShares Wholesale Australian Equity Index Fund (Class E Units)": "iShares Wholesale Australian Equity Index Fund",
"iShares Wholesale Australian Listed Property Index Fund (Class E Units)": "iShares Australian Listed Property Index Fund",
"iShares Global Listed Property Index Fund (Hedged Class E Units)": "",
"iShares Wholesale International Equity Index Fund (Class E Units)": "",
"iShares Hedged International Equity Index Fund (Class E Units)": "iShares International Equity Index (Hgd)",
"iShares ESG Global Bond Index Fund (Class E Units)": "",
"iShares Global Bond Index Fund (Class E Units)": "iShares Global Bond Index Fund"
}
```
- Sample 3:
- Input:
`provider_name`: 'Coolabah Capital Investments'
`prediction_fund`:
[
"Coolabah Short Term Income PIE Fund",
"Coolabah Long-Short Credit PIE Fund"
]
`true_fund`:
[
"Coolabah Long-Short Credit PIE Fund",
"Coolabah Short Term Income PIE Fund"
]
- Output:
```json
{
"Coolabah Short Term Income PIE Fund": "Coolabah Short Term Income PIE Fund",
"Coolabah Long-Short Credit PIE Fund": "Coolabah Long-Short Credit PIE Fund"
}
```
Context:
"""
# System prompt forcing JSON-only output from the model.
system_prompt = "You are helpful AI Data Analyst which helps to identify the data to get the information correctly. Read instruction carefully and provide the information accordingly into json format only."
# Deterministic (temperature 0), bounded completion for the matcher call.
parameters = {
    "temperature": 0,
    "max_tokens": 1000,
}
### Similarity methods
# Per-metric acceptance thresholds, consulted in this priority order by
# get_fund_match_final_score. NOTE: the misspelled names ("levenshtien",
# "thresold") are referenced elsewhere and kept for compatibility.
cosine_threshold = 0.9
levenshtien_threshold = 0.98
jaccard_thresold = 0.95
def get_cosine_score(fund_list, pred_fund_name):
    """Score pred_fund_name against every candidate with cosine similarity.

    Returns ({best_candidate: best_score}, index_of_best) — or ({}, 0) when
    fund_list is empty. The index refers to the (de-duplicated) candidate order.
    """
    scores = {}
    for candidate in fund_list:
        scores[candidate] = get_cosine_similarity(pred_fund_name, candidate)
    if not scores:
        return {}, 0
    best = max(scores, key=scores.get)
    return {best: scores[best]}, list(scores).index(best)
def get_jaccard_score(fund_list, pred_fund_name):
    """Score pred_fund_name against every candidate with Jaccard similarity.

    Returns ({best_candidate: best_score}, index_of_best) — or ({}, 0) when
    fund_list is empty. The index refers to the (de-duplicated) candidate order.
    """
    scores = {}
    for candidate in fund_list:
        scores[candidate] = get_jaccard_similarity(pred_fund_name, candidate)
    if not scores:
        return {}, 0
    best = max(scores, key=scores.get)
    return {best: scores[best]}, list(scores).index(best)
def get_levenshtien_score(fund_list, pred_fund_name):
    """Score pred_fund_name against every candidate with Levenshtein distance.

    Returns ({best_candidate: best_score}, index_of_best) — or ({}, 0) when
    fund_list is empty. The index refers to the (de-duplicated) candidate order.
    """
    scores = {}
    for candidate in fund_list:
        scores[candidate] = get_levenshtien_distance_score(pred_fund_name, candidate)
    if not scores:
        return {}, 0
    best = max(scores, key=scores.get)
    return {best: scores[best]}, list(scores).index(best)
def get_fund_match_final_score(fund_list, pred_fund_name):
    """Combine cosine, Jaccard and Levenshtein similarity into one verdict.

    Metrics are consulted in priority order (cosine, then Jaccard, then
    Levenshtein); the first whose best score clears its threshold decides the
    match. Returns (match_dict, matched_index, best_scores, best_names) where
    best_scores/best_names hold each metric's top result in that order, with
    "" when a metric produced nothing (empty fund_list).
    """
    ranked = (
        (get_cosine_score(fund_list, pred_fund_name), cosine_threshold),
        (get_jaccard_score(fund_list, pred_fund_name), jaccard_thresold),
        (get_levenshtien_score(fund_list, pred_fund_name), levenshtien_threshold),
    )
    final_result = {}
    matched_index = 0
    best_scores = []
    best_names = []
    for (metric_result, metric_index), threshold in ranked:
        if metric_result:
            name, score = next(iter(metric_result.items()))
            best_names.append(name)
            best_scores.append(score)
            # First metric over its threshold wins; later ones only report.
            if not final_result and score >= threshold:
                final_result = metric_result
                matched_index = metric_index
        else:
            best_names.append("")
            best_scores.append("")
    return final_result, matched_index, best_scores, best_names
### Format Response
def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name,
                    step0_pred_name=None, step0_db_name=None,
                    step0_matched_db_name_cosine=None, step0_matched_db_name_jacc=None, step0_matched_db_name_leven=None,
                    step0_cosine=None, step0_jaccard=None, step0_levenshtein=None,
                    step1_pred_name=None, step1_db_name=None,
                    step1_matched_db_name_cosine=None, step1_matched_db_name_jacc=None, step1_matched_db_name_leven=None,
                    step1_cosine=None, step1_jaccard=None, step1_levenshtein=None,
                    step2_pred_name=None, step2_db_name=None,
                    step2_matched_db_name_cosine=None, step2_matched_db_name_jacc=None, step2_matched_db_name_leven=None,
                    step2_cosine=None, step2_jaccard=None, step2_levenshtein=None,
                    step3_pred_name=None, step3_db_name=None,
                    step3_matched_db_name_cosine=None, step3_matched_db_name_jacc=None, step3_matched_db_name_leven=None,
                    step3_cosine=None, step3_jaccard=None, step3_levenshtein=None,
                    step4_pred_name=None, step4_db_name=None,
                    step4_matched_db_name_cosine=None, step4_matched_db_name_jacc=None, step4_matched_db_name_leven=None,
                    step4_cosine=None, step4_jaccard=None, step4_levenshtein=None,
                    llm_flag=None, llm_clean_pred_list=None, llm_clean_db_list=None, llm_pred_fund=None, llm_matched_db_name=None, llm_result=None):
    """Assemble one flat diagnostic record for a single predicted fund name.

    Each stepN_* group captures the names and similarity scores produced by
    that normalisation step; the llm_* fields describe the LLM fallback when
    it was used. Unused fields stay None.
    """
    record = {
        'doc_id': doc_id,
        'pred_fund': pred_fund,
        'db_fund': db_fund,
        'cleaned_pred_fund_name': clean_pred_name,
        'cleaned_db_fund_name': clean_db_name,
    }
    step_groups = [
        (step0_pred_name, step0_db_name, step0_matched_db_name_cosine, step0_matched_db_name_jacc,
         step0_matched_db_name_leven, step0_cosine, step0_jaccard, step0_levenshtein),
        (step1_pred_name, step1_db_name, step1_matched_db_name_cosine, step1_matched_db_name_jacc,
         step1_matched_db_name_leven, step1_cosine, step1_jaccard, step1_levenshtein),
        (step2_pred_name, step2_db_name, step2_matched_db_name_cosine, step2_matched_db_name_jacc,
         step2_matched_db_name_leven, step2_cosine, step2_jaccard, step2_levenshtein),
        (step3_pred_name, step3_db_name, step3_matched_db_name_cosine, step3_matched_db_name_jacc,
         step3_matched_db_name_leven, step3_cosine, step3_jaccard, step3_levenshtein),
        (step4_pred_name, step4_db_name, step4_matched_db_name_cosine, step4_matched_db_name_jacc,
         step4_matched_db_name_leven, step4_cosine, step4_jaccard, step4_levenshtein),
    ]
    for step, (p_name, d_name, m_cos, m_jac, m_lev, s_cos, s_jac, s_lev) in enumerate(step_groups):
        record[f'step{step}_pred_name'] = p_name
        record[f'step{step}_db_name'] = d_name
        record[f'step{step}_matched_db_name_cosine'] = m_cos
        record[f'step{step}_matched_db_name_jacc'] = m_jac
        # NOTE: key spelling "levenstn" is kept for downstream compatibility.
        record[f'step{step}_matched_db_name_levenstn'] = m_lev
        record[f'step{step}_cosine'] = s_cos
        record[f'step{step}_jaccard'] = s_jac
        record[f'step{step}_levenshtein'] = s_lev
    record['llm_flag'] = llm_flag
    record['llm_clean_pred_list'] = llm_clean_pred_list
    record['llm_clean_db_list'] = llm_clean_db_list
    record['llm_pred_fund'] = llm_pred_fund
    record['llm_matched_db_name'] = llm_matched_db_name
    record['llm_result'] = llm_result
    return record
def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"):
    """Match predicted fund names to database fund names.

    Runs a five-step normalisation cascade per predicted name — raw match,
    abbreviation expansion, stopword removal, special-character removal,
    common-word removal — and accepts the first step whose similarity clears
    a threshold (see get_fund_match_final_score). Names still unmatched
    afterwards are sent to the LLM in one batch.

    Args:
        doc_id: Document identifier copied into every diagnostic record.
        pred_list: Fund names extracted from the document.
        db_list: Candidate fund names from the database.
        provider_name: Provider name given to the LLM for context.
        doc_source: Configuration key selecting the abbreviation mapping.

    Returns:
        dict mapping each matched predicted name to its database name
        ("" when the LLM explicitly found no match).
    """
    final_result = {}
    df_data = []  # Per-fund diagnostic records; also patched by the LLM stage.
    unmatched_pred_list = pred_list.copy()
    unmatched_db_list = db_list.copy()

    def _step_kwargs(step, pred_name, db_names, names3, scores3):
        # Diagnostic keyword arguments for one pipeline step (stepN_* columns).
        return {
            f'step{step}_pred_name': pred_name,
            f'step{step}_db_name': db_names,
            f'step{step}_matched_db_name_cosine': names3[0],
            f'step{step}_matched_db_name_jacc': names3[1],
            f'step{step}_matched_db_name_leven': names3[2],
            f'step{step}_cosine': scores3[0],
            f'step{step}_jaccard': scores3[1],
            f'step{step}_levenshtein': scores3[2],
        }

    def _accept(pred_index, db_index, clean_pred, clean_db, diag):
        # Record a confirmed match and retire both names from the unmatched pools.
        pred_name = pred_list[pred_index]
        db_name = db_list[db_index]
        final_result.update({pred_name: db_name})
        df_data.append(format_response(doc_id, pred_name, db_name, clean_pred, clean_db,
                                       llm_flag=False, **diag))
        if db_name in unmatched_db_list:
            unmatched_db_list.remove(db_name)
        if pred_name in unmatched_pred_list:
            unmatched_pred_list.remove(pred_name)

    for index, pred_fund in enumerate(pred_list):
        try:
            diag = {}
            ### STEP-0: raw names as extracted.
            result, matched_index, scores, names = get_fund_match_final_score(db_list, pred_fund)
            diag.update(_step_kwargs(0, pred_fund, db_list, names, scores))
            if result:
                _accept(index, matched_index, pred_fund, list(result.keys())[0], diag)
                continue
            ### STEP-1: abbreviation replacement.
            pred1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0]
            db1 = replace_abbrevs_in_fundnames(db_list, doc_source)
            result, matched_index, scores, names = get_fund_match_final_score(db1, pred1)
            diag.update(_step_kwargs(1, pred1, db1, names, scores))
            if result:
                _accept(index, matched_index, pred1, list(result.keys())[0], diag)
                continue
            ### STEP-2: stopword removal.
            pred2 = remove_stopwords_nltk([pred1])[0]
            db2 = remove_stopwords_nltk(db1)
            result, matched_index, scores, names = get_fund_match_final_score(db2, pred2)
            diag.update(_step_kwargs(2, pred2, db2, names, scores))
            if result:
                _accept(index, matched_index, pred2, list(result.keys())[0], diag)
                continue
            ### STEP-3: special-character removal.
            pred3 = remove_special_characters([pred2])[0]
            db3 = remove_special_characters(db2)
            result, matched_index, scores, names = get_fund_match_final_score(db3, pred3)
            diag.update(_step_kwargs(3, pred3, db3, names, scores))
            if result:
                _accept(index, matched_index, pred3, list(result.keys())[0], diag)
                continue
            ### STEP-4: common-word removal.
            db4, _ = remove_common_words(db3)
            pred4 = remove_common_words(pred_list)[0][index]
            result, matched_index, scores, names = get_fund_match_final_score(db4, pred4)
            diag.update(_step_kwargs(4, pred4, db4, names, scores))
            if result:
                _accept(index, matched_index, pred4, list(result.keys())[0], diag)
            else:
                # All heuristics failed: keep a diagnostic row flagged for the
                # LLM stage (db_fund here is only the best step-4 candidate and
                # will be overwritten by the LLM result below).
                df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index],
                                               pred4, db_list[matched_index],
                                               llm_flag=True, **diag))
        except Exception as e:
            # Best-effort per fund: one bad name must not abort the batch.
            print("Error: ", e)

    if len(unmatched_pred_list) != 0:
        # Normalise both leftover lists the same way before prompting the LLM.
        cleaned_unmatched_pred_list = remove_special_characters(
            remove_stopwords_nltk(replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)))
        cleaned_unmatched_db_list = remove_special_characters(
            remove_stopwords_nltk(replace_abbrevs_in_fundnames(unmatched_db_list, doc_source)))
        prompt_context = f"""
{prompt_instruction}
provider_name: {provider_name}
prediction_fund:
{cleaned_unmatched_pred_list}
true_fund:
{cleaned_unmatched_db_list}
"""
        llm_response, with_error = chat(
            prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
        )
        if 'response' in llm_response.keys():
            try:
                llm_result = json.loads(llm_response['response'])
            except Exception:
                try:
                    # json_repair tolerates malformed JSON emitted by the model.
                    llm_result = json_repair.loads(llm_response['response'])
                except Exception:
                    llm_result = {}
            for k, v in llm_result.items():
                og_db_index = -1
                og_pred_index = -1
                if k in cleaned_unmatched_pred_list:
                    og_pred_index = cleaned_unmatched_pred_list.index(k)
                if og_pred_index == -1:
                    # The LLM sometimes swaps prediction and DB names.
                    if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
                        og_pred_index = cleaned_unmatched_pred_list.index(v)
                        og_db_index = cleaned_unmatched_db_list.index(k)
                        k, v = v, k
                if og_pred_index == -1:
                    continue  # Hallucinated key: ignore it.
                if og_db_index == -1 and v in cleaned_unmatched_db_list:
                    og_db_index = cleaned_unmatched_db_list.index(v)
                for record in df_data:
                    if record['pred_fund'] == unmatched_pred_list[og_pred_index]:
                        if og_db_index != -1:
                            record['db_fund'] = unmatched_db_list[og_db_index]
                            record['cleaned_db_fund_name'] = v
                            final_result.update({unmatched_pred_list[og_pred_index]: unmatched_db_list[og_db_index]})
                        else:
                            record['db_fund'] = ''
                            record['cleaned_db_fund_name'] = ''
                            final_result.update({unmatched_pred_list[og_pred_index]: ""})
                        record['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                        # Bug fix: the original line ended with a stray comma,
                        # which stored a one-element tuple instead of the list.
                        record['llm_clean_db_list'] = cleaned_unmatched_db_list
                        record['llm_pred_fund'] = k
                        record['llm_matched_db_name'] = v
                        record['llm_result'] = llm_result
                        break
    return final_result
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
    """Run fund-name matching over an API response and annotate each entry.

    Every entry first gets default "no match" fields; entries whose document
    fund name was matched are then rewritten with the database name. The
    mutated api_response is returned.
    """
    result = api_response['data']
    doc_fund_names = [entry['fund_name'] for entry in result]
    db_fund_names = all_investment_db_names.split(';')
    # Default every entry to "no match" before running the matcher.
    for entry in result:
        entry['result']['matched_db_fund_name'] = ''
        entry['result']['doc_fund_name'] = entry['fund_name']
        entry['result']['fund_name_matched'] = 'False'
    if len(doc_fund_names) > 0 and len(db_fund_names) > 0:
        fund_match_result = final_function_to_match(doc_id, doc_fund_names, db_fund_names, providerName)
        print("fund_match results: ", fund_match_result)
        for doc_name, db_name in fund_match_result.items():
            if not db_name:
                continue
            for entry in result:
                if doc_name == entry['fund_name']:
                    entry['fund_name'] = db_name
                    entry['result']['matched_db_fund_name'] = db_name
                    entry['result']['doc_fund_name'] = doc_name
                    entry['result']['fund_name_matched'] = 'True'
    api_response['data'] = result
    return api_response
# pred_list = ['Bond Fund', 'California Tax Free Income Fund', 'John Hancock Bond Fund', 'John Hancock California Tax Free Income Fund', 'John Hancock California Municipal Bond Fund', 'John Hancock Esg Core Bond Fund', 'John Hancock Government Income Fund', 'John Hancock High Yield Fund', 'John Hancock High Yield Municipal Bond Fund', 'John Hancock Income Fund', 'John Hancock Investment Grade Bond Fund', 'John Hancock Municipal Opportunities Fund', 'John Hancock Sovereign Bond Fund', 'John Hancock Short Duration Bond Fund', 'John Hancock Short Duration Municipal Opportunities Fund']
# db_list = ['JHancock Bond Fund', 'JHancock CA Municipal Bond Fund', 'JHancock ESG Core Bond Fund', 'JHancock Government Income Fund', 'JHancock High Yield Fund', 'JHancock High Yield Municipal Bond Fund', 'JHancock Income Fund', 'JHancock Investment Grade Bond Fund', 'JHancock Municipal Opportunities Fund', 'JHancock Short Dur Muncpl Opps Fd', 'JHancock Short Duration Bond Fund']
# provider_name = "John Hancock"
# doc = 123
# result = final_function_to_match(doc, pred_list, db_list, provider_name)
# print("\nresult: ",result)