From b15d260a58a2e7cda18d872da2dccfe5c96d67b8 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 21 Jan 2025 16:55:08 -0600 Subject: [PATCH] migrate name mapping algorithm from Ravi --- .gitignore | 1 + .../aus_prospectus/abbreviation_records.json | 1066 +++++++++++++++++ core/auz_nz/hybrid_solution_script.py | 717 +++++++++++ core/auz_nz/readme.md | 9 + core/auz_nz/string_similarity.py | 77 ++ core/data_extraction.py | 30 +- core/data_mapping.py | 133 +- core/data_translate.py | 5 +- main.py | 29 +- requirements.txt | 4 +- utils/gpt_utils.py | 27 +- 11 files changed, 2073 insertions(+), 25 deletions(-) create mode 100644 configuration/aus_prospectus/abbreviation_records.json create mode 100644 core/auz_nz/hybrid_solution_script.py create mode 100644 core/auz_nz/readme.md create mode 100644 core/auz_nz/string_similarity.py diff --git a/.gitignore b/.gitignore index adad70e..c1f6286 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ /specific_calc_metrics.py /test_specific_biz_logic.py /drilldown_practice.py +/core/auz_nz/__pycache__/*.pyc diff --git a/configuration/aus_prospectus/abbreviation_records.json b/configuration/aus_prospectus/abbreviation_records.json new file mode 100644 index 0000000..10302cb --- /dev/null +++ b/configuration/aus_prospectus/abbreviation_records.json @@ -0,0 +1,1066 @@ +{ + "H": "High", + "(acc)": "(acc)", + "(dist)": "(dist)", + "perf": "(perf)", + "1167 Act Fds ICAV": "1167 Active Funds ICAV", + "Andln": "Aandelen", + "AB": "AB SICAV I", + "Aberdeen Global II": "Aberdeen Global II", + "Aberdeen Liqdty (Lux)": "Aberdeen Liquidity Fund (Lux)", + "Aberdeen Std": "Aberdeen Standard", + "AS SICAV I": "Aberdeen Standard SICAV I", + "ABN AMRO": "ABN AMRO Multi-Manager Funds ", + "AAMMF FoM": "ABN AMRO Multi-Manager Funds - Fund of Mandates", + "Abs": "Absolute", + "AI": "Absolute Insight", + "Acct": "Account", + "Acc": "Accumulation", + "Achrs": "Achievers", + "Actns": "Actions", + "Actv": "Active", + "Act": "Active", + "actvy": "actively", + "Actr": "Actuaries", + "Admt": "Adamant", + "Adj": "Adjustable", + "Adjs": "Adjusted", + "Admin": "Administrative", + "Adm": "Administrative", + "Advcd": "Advanced", + "Advnr": "Advancer", + "Advtg": "Advantage", + "Advts": "Advantus", + "Advnturs": "Adventurous", + "Adv": "Advisor", + "Advs": "Advisors", + "Arspc": "Aerospace", + "Afts": "Affiliates", + "Afr": "Africa", + "Agcy": "Agency", + "Aggt": "Aggregate", + "Agrsv": "Aggressive", + "AgriBsns": "AgriBusiness", + "Agril": "Agricultural", + "Agc": "Agricultural", + "Agltr": "Agriculture", + "Agr": "Agriculture", + "Agri": "Agrinvest", + "Abg": "Airbag", + "Air": "Airlines", + "Akcvy": "Akciovy", + "Akt": "Aktier", + "Albtrs": "Albatross", + "Allnc": "Alliance", + "AllncBrnstn": "AllianceBernstein", + "Allc": "Allocation", + "Allctr": "Allocator", + "Alp": "Alpha", + "Alt": "Alternative", + "Alts": "Alternatives", + "Ambt": "Ambition", + "Amer": "American", + "Am": "American", + "AFS": "Amundi Fund Solutions -", + "Amundi Fds": "Amundi Funds", + "Amundi Fds II": "Amundi Funds II -", + "Amundi IS": "Amundi Index Solutions - Amundi Index", + "Amundi Mny Mkt Fd": "Amundi Money Market Fund", + "Amundi SF": "Amundi S.F. 
-", + "Angl": "Angel", + "Ann": "Annual", + "ADis": "Annually Distribution", + "AD": "Annually Distribution", + "ATA": "ANTARCTICA", + "AntiBench": "Antibenchmark", + "App": "Appliances", + "Apprec": "Appreciation", + "Appr": "Approche", + "Apl": "April", + "Arbn": "Arabian", + "Arbtrg": "Arbitrage", + "Ar": "Area", + "ARM": "ARMENIA", + "Artfcl": "Artificial", + "AsiaPac": "Asia Pac\u00edfic", + "APAC": "Asia Pac\u00edfic", + "Asn": "Asian", + "Asst": "Asset", + "Ast": "Asset", + "Assts": "Assets", + "Assrd": "Assured", + "Audentia Capital": "Audentia Capital SICAV PLC", + "Ausgwn": "Ausgewogen", + "AUS": "Australian", + "Autcll": "Autocallable", + "Atmt": "Automated", + "Atmtn": "Automation", + "Avant": "Avantage", + "Avrg": "Average", + "Avg": "Average", + "Aviva Investors": "Aviva Investors", + "Awrns": "Awareness", + "AXAIMFIIS": "AXA IM Fixed Income Investment Strategies", + "AXAWF": "AXA World Funds", + "AXAWF II": "AXA World Funds II", + "Bckd": "Backed", + "Bkwrdt": "Backwardated", + "Baillie Gifford WW": "Baillie Gifford Wldwd", + "Bal": "Balanced", + "Bk": "Bank", + "Bkg": "Banking", + "BL": "Banque de Luxembourg", + "Bantleon sel": "Bantleon select", + "Bcly": "Barclays", + "Bsn": "Basin", + "Behvrl": "Behavioral", + "Bench": "Benchmark", + "Bnf": "Benefit", + "Bt": "Beta", + "Bvrg": "Beverage", + "Biotech": "Biotechnology", + "BlkRk": "BlackRock", + "BR": "BlackRock", + "Blnd": "Blended", + "Blmbrg": "Bloomberg", + "BBMSCI": "Bloomberg Barclays MSCI", + "Bl Chp": "Blue Chip", + "BNPP B Strategy": "BNP Paribas B Strategy", + "BNY": "BNY Mellon Liquidity Funds PLC", + "Bd": "Bond", + "Bds": "Bonds", + "Bnsm": "Bonusom", + "Bstr": "Booster", + "Brds": "Brands", + "BrdwnGLB": "BrandywineGLOBAL", + "BRA": "Brazil", + "Bnt": "Brent", + "Bdgw": "Bridgeway", + "Brd": "Broad", + "Bdpst": "Budapest", + "Bldr": "Builder", + "Bnd": "Bundle", + "CaixaBank Global": "CaixaBank Global SICAV", + "CA": "California", + "Cll": "Call", + "Cnd": "Canada", + "CAD": "Canadian Dollar ", + "Candriam Bds": "Candriam Bonds", + "Candriam Eqs B": "Candriam Equities B", + "Candriam Eqs L": "Candriam Equities L", + "Candriam Sustainable": "Candriam Sustainable", + "Cp": "Cap", + "Cptl": "Capital", + "CapitalatWork": "Capitalatwork Foyer Umbrella", + "Cap": "Capitalisation", + "Cpd": "Capped", + "Cps": "Caps", + "Carb": "Carbon", + "Cr": "Core", + "Carmignac Pf": "Carmignac Portfolio", + "Csh": "Cash", + "Cau": "Cautious", + "Cdl": "Cedola", + "Cntrl": "Central", + "Cntrc": "Centric", + "Crtfcts": "Certificates", + "Csky": "Cesky", + "Chg": "Change", + "Chrts": "Charities", + "CHN": "CHINA", + "Chns": "Chinese", + "CNY": "Chinese Yuan ", + "Chp": "Chip", + "Chc": "Choice", + "Cts": "Cities", + "Cl": "Classic", + "Clmt": "Climate", + "Cls": "Close", + "Cld": "Cloud", + "Cogntv": "Cognitive", + "Coll": "Collateralized", + "Cllctn": "Collection", + "Cllctv": "Collective", + "Collect": "Collectivit\u00e9s", + "Col": "COLOMBIA", + "Colord": "Colorado", + "Cmfrt": "Comfort", + "Comrcl": "Commercial", + "Commercial": "Commercialization", + "Cmdts": "Commodities", + "Cmdty": "Commodity", + "Cmd": "Commodity", + "CIF": "Common Investment Fund", + "Comm": "Communication", + "Cie": "Compagnie", + "Cies": "Compagnies", + "Coms": "Companies", + "Com": "Company", + "Cmp": "Compass", + "Cmplt": "Complete", + "Compnt": "Component", + "Comps": "Composite", + "Cmprhsv": "Comprehensive", + "Comp": "Computer", + "Cmptg": "Computing", + "Concntr": "Concentrated", + "Concpt": "Concept", + "Cndtnl": "Conditional", + "Cnsrv": 
"Conservative", + "Cnsv": "Conservative", + "CsvtCvtb": "Conservative Convertible", + "Convr": "Conserver", + "Cons": "Consolidado", + "Cnstnt": "Constant", + "Constnd": "Constrained", + "Constr": "Constraint", + "Const": "Construction", + "Cnsmr": "Consumer", + "Continen": "Continental", + "Contnn": "Continental", + "Cntgnt": "Contingent", + "Contra": "Contrarian", + "Ctrl": "Control", + "Contrvrsl": "Controversial", + "Cont": "Controversial", + "Cnvrt": "Convertible", + "Convert": "Convertibles", + "Cnvrts": "Convertibles", + "Convex": "Convexit\u00e9", + "Convct": "Conviction", + "Convict": "Convictions", + "Coop": "Cooper", + "Cor": "Core", + "Corp": "Corporates", + "CrpBdIdx": "Corporate Bond Index", + "Countrs": "Countries", + "Cntry": "Country", + "Cpn": "Coupon", + "Cov": "Covered", + "Crct": "Creciente", + "Crdt": "Credit", + "CS": "Credit Suisse", + "CSV SIF": "Credit Suisse Virtuoso SICAV - SIF", + "Cre": "Creek", + "Crsvr": "Crossover", + "Crd": "Crude", + "Cum": "Cumulative", + "Ccis": "Currencies", + "Ccy": "Currency", + "Cust": "Custom", + "Custmzd": "Customized", + "Cycl": "Cyclicals", + "Dl": "Daily", + "Danske FoF": "Danske Fund of Funds", + "Danske Invest Allc": "Danske Invest Allocation SICAV", + "Danske Invest": "Danske Invest SICAV", + "Dt": "Date", + "Dtd": "Dated", + "db": "Db", + "db AM": "db Advisory Multibrands", + "De": "Death", + "Dbt": "Debt", + "Dfnc": "Defence", + "Dfndr": "Defender", + "Defensv": "Defensive", + "Defesv": "Defensive", + "Dfnd": "Defined", + "Dlt": "Delta", + "Delta Lloyd L": "Delta Lloyd L", + "Dmnd": "Demand", + "Dmgrphcs": "Demographics", + "Demgrph": "Demography", + "Dmgr": "Demography", + "Dnmntd": "Denominated", + "Dpsts": "Deposits", + "Dsgntd": "Designated", + "Deutlnd": "Deutschland", + "Dev": "Developed", + "Devpg": "Developing", + "Devpmt": "Development", + "Dgnstcs": "Diagnostics", + "Dgtl": "Digital", + "Dimsnl": "Dimensional", + "Drt": "Direct", + "Dir": "Direct", + "Discplnd": "Disciplined", + "Dscnt": "Discount", + "Discvs": "Discoveries", + "Discv": "Discovery", + "Discret": "Discretion", + "Disctnry": "Discretionary", + "Disc": "Discretionary", + "Disrpt": "Disruptive ", + "Dsrpt": "Disruptive ", + "Dis": "Distribution", + "Divers": "Diversified", + "Div": "Dividend", + "Dvrs": "Diversified", + "Divst": "Diversity", + "DP": "Dividend Payout", + "DR": "Dividend Reinvestment", + "Divs": "Dividends", + "Dlhps": "Dluhopisu", + "Dbyvtl": "Dobyvatelia", + "Dllr": "Dollar", + "Domst": "Domestic", + "DPAM Capital B": "DPAM Capital B", + "Drvn": "Driven", + "Durb": "Durable", + "Drbl": "Durables", + "Dur": "Duration", + "Dyn": "Dynamic", + "Erns": "Earnings", + "Estn": "Eastern", + "Eaton Vance Intl (CYM)": "Eaton Vance International (Cayman Islands)", + "Eaton Vance Intl (IRL)": "Eaton Vance International (Ireland)", + "Ecomm": "Ecommerce ", + "Eco": "Economie", + "Ecos": "Economies", + "Ecoy": "Economy", + "Edu": "Education", + "Elevation Fds (IE)": "Elevation UCITS Funds (Ireland) ICAV", + "E": "Elite", + "Emgnt": "Emergente", + "Em": "Emerging", + "Emerg": "Emerging", + "Em Mkts": "Emerging Markets", + "EM": "Emerging Markets", + "Emply": "Employee", + "Emplmt": "Employment", + "Empwrmt": "Empowerment", + "Endwmnt": "Endowment ", + "Endur": "Endurance", + "Eggm": "Eneagement", + "Engy": "Energy", + "Ey": "Energy", + "Eng": "English", + "Enh": "Enhanced", + "EnhFxIn": "Enhanced Fixed Income", + "Entrprs": "Entrepreneurs", + "Ents": "Enterprises", + "Entrepr": "Entreprendre", + "Entrprnrs": "Entrepreneurs", + "Entr": "Entreprise", 
+ "Envir": "Environment", + "Envirtly": "Environmentally", + "Epch": "Epoch", + "Epsilon Fund": "Epsilon Fund", + "Eqlty": "Equality", + "Eqs": "Equities", + "Eq": "Equity", + "Essential Port Sel": "Essential Portfolio Selection", + "Estblshd": "Established", + "Estt": "Estate", + "Etcl": "Ethical", + "EUR": "EUR", + "\u20ac": "EUR", + "Euro": "Euro", + "EURO": "EURO", + "Eurbl": "Eurobloc", + "Eurodoll": "Eurodollar", + "Eurlnd": "Euroland", + "Eurp": "European", + "Erst": "Eurostoxx", + "Euroz": "Eurozone", + "Ev": "Event", + "Evol": "Evolutif", + "Evvg": "Evolving", + "Excld": "Excluding", + "Excl": "Exclusif", + "Exclsv": "Exclusive", + "Exm": "Exempt", + "Expc": "Expectations", + "Expts": "Expertise", + "Exptrs": "Exporters", + "Exps": "Exps", + "ext": "extend", + "Extnl": "External", + "Extr": "Extra", + "Fac": "Factor", + "Facs": "Factors", + "Flln": "Fallen", + "Fam": "Familiales", + "Fml": "Familie", + "FCP": "Fcp", + "Feb": "February", + "Fedrtd": "Federated", + "Fdr": "Founder", + "Fidelity": "Fidelity Funds", + "Fid": "Fiduciary", + "Finac": "Finance", + "Fincl": "Financial", + "Fincls": "Financials", + "Fi": "Financials", + "Fndr": "Finder", + "Fst": "First", + "Fxd": "Fixed", + "Flx": "Flex", + "Flex": "Flexible", + "Fltng": "Floating", + "Fl": "Floating", + "Flrd": "Floored", + "Foc": "Focus", + "Fcs": "Focused", + "Focused": "Focused SICAV", + "Fstg": "Forstrong", + "Fortn": "Fortnightly", + "FH Aberdeen Global": "Forvaltningshuset Aberdeen Global", + "Fssl Ful": "Fossil Fuel", + "Fndtn": "Foundation", + "Fdrs": "Founders", + "Frmlgtn": "Framlington", + "Frm": "Framlington", + "Fran": "Franchise", + "Frk Flx Er Agt Bd": "Franklin Flexible Euro Aggregate Bond", + "Frntr": "Frontier", + "Frtl": "Frontline ", + "Fd": "Fund", + "FoF": "Fund of Funds", + "Fdmtl": "Fundamental", + "Fdml": "Fundamental", + "Fds": "Funds", + "Fut": "Future", + "Futs": "Futures", + "GAM": "Gam", + "Garant": "Garantizado", + "Gtmr": "Gartmore", + "GBP": "GBP", + "\u00a3": "GBP", + "Gndr": "Gender", + "Gen": "General", + "Grmny": "Germany", + "Gestielle": "Gestielle Investment SICAV", + "Gest": "Gestion", + "Gnts": "Giants", + "Glt": "Gilts", + "Glbl": "Global", + "Glb": "Global", + "GDOF": "Global Dynamic Opportunities Fund Ltd.", + "GEI": "Global Equity Income ", + "GlInGd": "Global Investment Grade", + "Glble": "Globale", + "Glblnch": "Globalnich", + "GS": "Goldman Sachs", + "GSF II": "Goldman Sachs Funds II", + "Goodbody": "Goodbody Platform ICAV", + "Gouvrmntls": "Gouvernementales", + "Govt": "Government", + "Gvs": "Govies", + "Grd": "Grade", + "Grde": "Grande", + "Grter": "Greater", + "Grs Inc": "Gross Income", + "GI": "Gross Income", + "GP": "Gross Paying", + "Grp": "Group", + "Grwr": "Growers", + "Gr": "Growth", + "GSQrtx": "GSQuartix", + "Grtd": "Guaranteed", + "HY": "High Yield", + "Hpshr": "Hampshire", + "Hrd": "Hard", + "Hrdwr": "Hardware", + "Hdstrt": "Headstart", + "Hlth": "Health", + "Hlthcare": "Healthcare", + "Hlthcr": "Healthcare", + "Hdg": "Hedged", + "HF": "Hedged Fund", + "Hndrsn Pn": "Henderson Pan", + "Heptagon": "Heptagon Fund PLC", + "Hereford Fds": "Hereford Funds", + "Hertg": "Heritage", + "Hxvt": "Hexavest", + "Hi": "High", + "Hi Yld": "High Yield", + "Hldg": "Holding", + "HKD": "Hong Kong Dollar ", + "Hrzn": "Horizon", + "Hosptlty": "Hospitality", + "Human": "Humanisme", + "Hngrn": "Hungarian", + "Hyb": "Hybrid", + "Impct": "Impact", + "Imp": "Impact", + "Incld": "Including", + "Incl": "Including", + "Inc": "Income", + "Indep": "Independence", + "Idx": "Index", + "Idxd": 
"Indexed", + "Idxng": "Indexing", + "INR": "Indian Rupee ", + "Individualnh": "Individualniho", + "IDR": "Indonesian Rupiah ", + "Industr": "Industralized", + "Indstr": "Industrials", + "Inds": "Industries", + "Infl": "Inflation", + "Infor": "Information", + "Info": "Information", + "Infmd": "Informed", + "Infras": "Infrastructure", + "Initl": "Initial", + "Innovt": "Innovation", + "Innvtv": "Innovative", + "Innovtr": "Innovators", + "Insnstv": "Insensitive", + "Insim": "Insieme", + "Insgts": "Insights", + "Inst": "Institution", + "Instl": "Institutional", + "Ins": "Insurance", + "Insts": "Institutions", + "Intllgnc": "Intelligence", + "Intlgc": "Intelligence", + "Intst": "Interest", + "IntrR": "Interest Rate", + "Intmdt": "Intermediate", + "Itmt": "Intermediate", + "Intl": "International", + "Internat": "Internationales", + "Inter": "Interval", + "Intrs": "Intrinsic", + "Invrs": "Inverse", + "Inv": "Investors", + "Investec GSF": "Investec Global Strategy Fund", + "Investec SIV": "Investec Series IV", + "Invt": "Investing", + "Invmt": "Investment", + "Invm": "Investment", + "IG": "Investment Grade", + "Invmts": "Investments", + "Invms": "Investments", + "Irl": "Ireland", + "IEP": "Irish Pound ", + "Issr": "Issuer", + "ITA": "Italy", + "JPM": "JPMorgan Liquidity Funds", + "Jan": "Janvier", + "Jpn": "Japanese", + "JPY": "Japanese Yen ", + "JPM ISF II": "JPMorgan Investment Strategies Funds II", + "JPMF": "JPMorgan Portfolio Strategies Funds", + "Jmpr": "Jumper", + "Jmpstr": "Jumpstart", + "Kairos Alpha SICAV": "Kairos Alpha SICAV", + "Kairos Intl SICAV": "Kairos International SICAV", + "KBC Eq Fd": "KBC Equity Fund", + "KBC Index Fd": "KBC Index Fund", + "KBC Instl Fd": "KBC Institutional Fund ", + "KBC Master Fd": "KBC Master Fund", + "KBC Renta": "KBC Renta", + "KBC Select Immo": "KBC Select Immo", + "KH": "Kleinwort Hambros ", + "Knzvtvn": "Konzervativni", + "Krtkdbh": "Kratkodobych", + "Ldrd": "Laddered", + "Lrg": "Large", + "Lgr": "Larger", + "LA": "Latin America", + "Lattd": "Latitude", + "Ldrs": "Leaders", + "Ldrsp": "Leadership", + "Ldg": "Leading", + "Lrng": "Learning", + "Lvl": "Level", + "Lvrg": "Leverage", + "Lvrgd": "Leveraged", + "LGIP": "LGIP Funds (Lux)", + "Lf": "Life", + "Lfstyl": "Lifestyle", + "Ltd": "Limited", + "Lnkd": "Linked", + "Liqd": "Liquid", + "Liq": "Liquidez", + "Liqdty": "Liquidity", + "Lqdty": "Liquidity", + "Lstd": "Listed", + "Lvstk": "Livestock", + "Lvg": "Living", + "Ln": "Loan", + "Lns": "Loans", + "Lcl": "Local", + "LO Funds III": "Lombard Odier Funds III", + "LO Selection": "Lombard Odier Selection", + "LSE": "London Stock Exchange", + "Lng": "Long", + "Lg": "Long", + "L S": "Long Short", + "L/S": "Long/Short", + "L/T": "Long Term", + "L-S": "Long-Short", + "Lw": "Low", + "LRWgt": "Low Risk Weighted", + "LUX": "Luxembourg", + "Lyn": "Lynch", + "Lyxor Invmt Fds": "Lyxor Investment Funds", + "Lyxor Newcits II Plc": "Lyxor Newcits II Plc", + "Macq": "Macquarie", + "Mac": "Macro", + "Mntn": "Maintain", + "MAS": "Malaysia", + "Mgd": "Managed", + "Mgmt": "Management", + "Mgr": "Manager", + "Mgrs": "Managers", + "Manu": "Manulife", + "Manulife GF": "Manulife Global Fund", + "Mkt": "Market", + "MN": "Market Neutral", + "MNP": "Market Neutral Portfolio", + "Mkts": "Markets", + "Mkwd": "Marketwide", + "Mtr": "Master", + "Mstr": "Masters", + "Machg": "Matching", + "Matrls": "Materials", + "Mat": "Maturity", + "Mxmsr": "Maximiser", + "Med": "Mediterranean", + "Mdm": "Medium", + "M/T": "Medium Term", + "Mgtrnd": "Megatrend", + "Mrg": "Merger", + "Mrl": "Merrill", + 
"Mtl": "Metal", + "Mtls": "Metals", + "MFS Inv": "MFS\u00ae Investment Funds", + "MFS Meridian": "MFS\u00ae Meridian Funds", + "Md": "Mid", + "Mdl": "Middle", + "Mnrs": "Miners", + "Min": "Minimum", + "Mng": "Mining", + "MnRsk": "MinRisk", + "Mirabaud": "Mirabaud Luxembourg SIF", + "Mitlnd": "Mittelstand", + "Mod": "Moderate", + "Mdfd": "Modified", + "Momt": "Momentum", + "Mny": "Money", + "Mth": "Month", + "Mn": "Monthly", + "Mthly": "Monthly", + "Mly": "Monthly", + "MDis": "Monthly Distribution", + "MD": "Monthly Distribution", + "Mnmnt": "Monument", + "Mt": "Monument", + "Moorea Fd": "Moorea Fund", + "MS INVF": "Morgan Stanley Investment Funds", + "Mortg": "Mortgage", + "Mlt": "Multi", + "Multi Challenge": "Multi Challenge SICAV", + "MltAdv": "Multiadvisers", + "Mltalt": "Multialternative", + "Mltast": "Multiasset", + "Mlt-Asst": "Multi-Asset", + "MA": "Multi-Asset", + "Multicoop": "Multicooperation", + "Mltfct": "Multifactor", + "Mlt-Mgr": "Multi-Manager", + "MltOpps": "Multiopportunities SICAV", + "Multipartner": "Multipartner SICAV", + "RobecoSAM": "Multipartner SICAV - RobecoSAM", + "Mltplr": "Multiplier", + "Mltsct": "Multisector", + "MSMM": "Multi-Style, Multi-Manager SICAV Funds plc", + "MU Lux": "MULTI-UNITS LUXEMBOURG", + "Muncpl": "Municipal", + "Mut": "Mutual", + "Myfd": "MY.fund", + "Ntnl": "National", + "Natrl": "Natural", + "Nat": "Naturelles", + "Nbg Bm": "Neuberger Berman", + "Nflz": "Neuflize", + "Netrl": "Neutral", + "New Capital": "New Capital Fund Lux", + "Nwtn": "Newton", + "NN (B) Invest": "NN (B) Invest", + "NN (L) Intl": "NN (L) International", + "NN (L) Pat": "NN (L) Patrimonial", + "NVt": "Non-Voting", + "Nordea 1 -": "Nordea 1 -", + "Nrm": "Normal", + "Nrth": "North", + "NT": "Northern Trust Ucits Common Contractual Fund", + "Nor": "NORWAY", + "NOK": "Norwegian Krone ", + "Nov": "November", + "Nvych": "Novych", + "O\u2019Sh": "O\u2019Shaughnessy", + "Oblig": "Obligatie", + "Obl": "Obligationer", + "Oct": "Octobre", + "Off": "Offensiv", + "Offsh": "Offshore", + "Op": "Open", + "Oppc": "Opportunistic", + "Opports": "Opportunities", + "Opps": "Opportunities", + "Opp": "Opportunity", + "Optm": "Optimum", + "Optd": "Optimised", + "Optr": "Optimiser", + "Optmzr": "Optimizer", + "Optimum": "Optimum Fund", + "Opt": "Option", + "Ord": "Ordinary", + "Ori": "Orient", + "Oth": "Other", + "Ovrs": "Overseas", + "Ovrwrtg": "Overwriting", + "Ownshp": "Ownership", + "Pac": "Pacific", + "Ps": "Paesi", + "Prmtrc": "Parametric", + "Paty": "Parity", + "Part": "Partenaires", + "Prtly": "Partially", + "PtH": "Partially-Hedged", + "Ptcpt": "Participant", + "Partic": "Participation", + "Ptnr": "Partner", + "Ptnrs": "Partners", + "PGLI": "Partners Group Listed Investments SICAV", + "Partners Group Listed": "Partners Group Listed Investments SICAV - Listed", + "Pasv": "Passive", + "Patrim": "Patrimoine", + "Patriml": "Patrimonial", + "Py": "Pay", + "Pyt": "Payout", + "P2P": "Peer to Peer", + "Pensn": "Pension", + "Pen": "Pension", + "Perf": "Performance", + "Perfm": "Performers", + "Prdic": "Periodic", + "Prd": "Periodo", + "Perpt": "Perpetual", + "Psnl": "Personal", + "Phrm": "Pharma", + "Phrmctls": "Pharmaceuticals", + "PI Inv": "PI Investment Funds", + "PIMCO": "PIMCO", + "PIMCO IRL": "PIMCO Funds Ireland PLC", + "PIMCO GIS": "PIMCO Funds: Global Investors Series plc", + "PIMCO Sel": "PIMCO Select Funds PLC", + "Pinr": "Pioneer", + "Pvvrv": "Pivovarov", + "Pln": "Plan", + "Pltnm": "Platinum", + "Plato IIF": "Plato Institutional Index Fund", + "Plyrs": "Players", + "plc": "plc.", + 
"Pl": "Pool", + "Polar Cap": "Polar Capital Funds PLC", + "Plcy": "Policy", + "Pld": "Pooled", + "Port": "Portfolio", + "Ptf": "Portfolio", + "Pstv": "Positive", + "Pwr": "Power", + "Prec": "Precious", + "PM": "Precious Metals", + "Prfrnc": "Preference ", + "Pref": "Preferred", + "Pre": "Premia", + "Prem": "Premier", + "Prm": "Premium", + "Presv": "Preservation", + "Prstg": "Prestige", + "Prc": "Price", + "Prcng": "Pricing", + "Prlztst": "Prilezitosti", + "Pr": "Prime", + "Princ": "Principal", + "Principal": "Principal Global Investors Funds", + "Priv": "Private", + "PBFI": "Private Bank Funds I", + "Privl": "Privilege", + "Prcss": "Process", + "Prod": "Products", + "Prfl": "Profile", + "Prog": "Progressif", + "Prgrv": "Progressive", + "Prpty": "Property", + "Protec": "Protecci\u00f3n", + "Protd": "protected", + "Prot": "Protection", + "Prvds": "Providus", + "Prdnt": "Prudente", + "Pru": "Prudential", + "PCFS": "Pure Capital Fund SICAV", + "Pure SIF SA": "Pure SICAV-SIF S.A.", + "PtWrt": "PutWrite", + "PW": "PutWrite", + "Qual": "Quality", + "Qul": "Quality", + "Qntmtl": "Quantamental", + "Quant": "Quantitative", + "Quants": "Quantitatives", + "Qt": "Quarterly", + "QDis": "Quarterly Distribution", + "QD": "Quarterly Distribution", + "Quoniam Fds Sel": "Quoniam Funds Selection SICAV", + "RAMS": "RAMS Investment Unit Trust", + "Rt": "Return", + "Rts": "Reuters", + "Rl": "Real", + "RE": "Real Estate", + "Rl Rt": "Real Return", + "Rms": "Reams", + "Rsnbl": "Reasonable", + "Rebal": "Rebalance", + "Rcvy": "Recovery", + "Red Arc Glb Invms": "Red Arc Global Investments (Ireland) ICAV", + "Rgnl": "Regional", + "Rglr": "Regular", + "Reg": "Regular", + "Relatv": "Relative", + "Rlx": "Relax", + "Rendim": "Rendimiento", + "Rnt": "Renta", + "RF": "Renta Fija", + "Renta": "Rentabilit\u00e9", + "Rsrch": "Research", + "Rsh": "Research", + "Rsrv": "Reserves", + "Res": "Resources", + "responsibility": "responsAbility SICAV (Lux)", + "Rspnb": "Responsible", + "Resrs": "Ressources", + "Restrc": "Restricted", + "Rstrcng": "Restructuring", + "Retl": "Retail", + "Ret": "Return", + "Retrs": "Reuters", + "Rev": "Revenue", + "Rvvl": "Revival", + "Revolt": "Revolution", + "Rsg": "Rising", + "Rsk": "Risk", + "Rd": "Road", + "Rds": "Roads", + "Rbtc": "Robotics", + "Rdn": "Rodina", + "RLBF II": "Royal London Bond Funds II ICVC", + "RCCF": "Russell Common Contractual Fund", + "RIQIC Fund plc": "Russell Investments Qualifying Investor China Fund plc", + "S&P": "S&p", + "Sat": "Satellites", + "Satsftn": "Satisfaction", + "Svg": "Saving", + "Schroder AS": "Schroder Alternative Solutions", + "Schroder GAIA": "Schroder GAIA", + "Schroder GAIA II": "Schroder GAIA II", + "Schroder ISF": "Schroder International Selection Fund", + "Schroder Invmt Fd": "Schroder Investment Fund", + "Schroder Sel": "Schroder Selection", + "Schroder SMBC Glb Bd": "Schroder SMBC Global Bond Series", + "Schroder SSF": "Schroder Special Situations Fund", + "Sci": "Scientific", + "Scintfc": "Scientific", + "Scrd": "Scored", + "Scrn": "Screened", + "Sect": "Sectors", + "Secu": "Secure", + "Secs": "Securities", + "Scs": "Securities", + "Sctsd": "Securitised", + "Sctzd": "Securitized", + "Sec": "Security", + "Sgrgtd": "Segragated", + "SEI GAF": "SEI Global Assets Fund plc - The SEI", + "SEI GIF": "SEI Global Investments Fund Plc - The SEI", + "SEI GMF ": "SEI Global Master Fund plc - The SEI", + "Selec": "Selecci\u00f3n", + "Sel": "Selectis", + "Slctv": "Selective", + "Sdis": "Semi-annual Distribution", + "SD": "Semi-annual Distribution", + "Sr": "Senior", 
+ "Ser": "Series", + "Svc": "Service", + "Svd Plfm": "Serviced Platform SICAV", + "Svcs": "Services", + "Shckltn": "Shackleton", + "SSE": "Shanghai Stock Exchange", + "Shr": "Share", + "Shld": "Shareholder", + "Shrs": "Shares", + "Shrh": "Shariah ", + "ShelteR Invest": "ShelteR Invest", + "Shrt": "Short", + "Short Dur": "Short Duration", + "Shrt Dur": "Short Duration", + "S/T": "Short-Term", + "SICAV": "sicav", + "Smplty": "Simplicity", + "SGD": "Singapore Dollar ", + "Sits": "Situations", + "Sivek": "Sivek", + "Skyline": "Skyline Umbrella Fund ICAV", + "Slv": "Sleeve", + "Sm": "Small", + "S&M": "Small & Mid", + "Sm Cp": "Small Caps", + "SmCp": "SmallCap", + "Sm-Cp": "Small-Cap", + "Smlr": "Smaller", + "Smrt": "Smart", + "SB": "Smart Beta", + "Smrtfd": "Smartfund", + "Smd": "Smid", + "Sclly": "Socially", + "ScllyAwr": "Socially Aware", + "Sftwr": "Software", + "Solid": "Solidaire", + "Sodty": "Solidarity", + "Solu": "Solutions", + "Sostnbl": "Sostenible", + "Souv": "Souverain", + "Sov": "Sovereigns", + "Svrgn": "Soverign", + "Spec": "Special", + "Spctm": "Spectrum", + "Sptlt": "Spotlight", + "Sqr": "Square", + "Stblty": "Stability", + "Stbl": "Stable", + "Std": "Standard", + "SLI": "Standard Life Investments Global SICAV", + "SLI GS II": "Standard Life Investments Global SICAV II", + "Stp": "Staples", + "Strt": "Start", + "Stt Strt": "State Street", + "Statstcl": "Statistical", + "Stpng": "Steepening", + "Stlg": "Sterling", + "Stwdsp": "Stewardship", + "Stk": "Stock", + "Stckpckr": "Stockpicker", + "Stks": "Stocks", + "Strat": "Strategy", + "Strats": "Strategies", + "Stgy": "Strategy", + "Struct": "Structured", + "Strctr": "Structures", + "Sbctnt": "Subcontinent", + "SbFd": "Sub-Fund", + "Sub": "Subsector", + "Skk ": "Sukuk ", + "Spr": "Super", + "ST Plus": "Super Trust Plus", + "Sprntnl": "Supranational", + "Srpls": "Surplus", + "Sustnby": "Sustainability", + "Sstby": "Sustainability", + "Sust": "Sustainable", + "Sst": "Sustainable", + "SEK": "Swedish Krona ", + "Sw": "Sweep", + "Swisscanto (LU) BF": "Swisscanto (LU) Bond Fund", + "Swisscanto (LU) EF": "Swisscanto (LU) Equity Fund", + "Swisscanto (LU) MMF": "Swisscanto (LU) Money Market Fund", + "Swisscanto (LU) PF": "Swisscanto (LU) Portfolio Fund", + "Switz": "Switzerland", + "Symphonia": "Symphonia Lux SICAV", + "Sntgm": "Syntagma", + "Sys": "System", + "Systmtc": "Systematic", + "Sysmc": "Systematic", + "T. Rowe Price": "T. Rowe Price Funds SICAV", + "Tact": "Tactical", + "Tailrd": "Tailored", + "Trgt": "Target", + "Tech": "Technology", + "Techs": "Technologies", + "Tchs": "Technologies", + "Tele": "Telecom", + "Telecms": "Telecommunications", + "Tmpltn": "Templeton", + "Trm": "Termine", + "Thm": "Thomson", + "Thms": "Themes", + "Thmsn": "Thomson", + "Thr Brdg Eurp": "Three Bridges Europe", + "Tilney ICAV": "Tilney Umbrella A ICAV", + "Tmng": "Timing", + "Ttl": "Total", + "TR": "Total Return", + "Trk": "Track", + "Trkr": "Tracker", + "Trdbl": "Tradable", + "Trd": "Trade", + "Trdg": "Trading", + "Trnh": "Tranche", + "Trsctn": "Transaction", + "Trans": "Transamerica", + "Transfmt": "Transformational", + "Trnsfm": "Transformational", + "Trsptn": "Transportation", + "Treas": "Treasuries", + "Trs": "Treasury", + "Trnd": "Trend", + "Trnds": "Trends", + "Trndy": "Trendy", + "Trhv": "Trhov", + "Trl": "Trials", + "Trif": "Triflex", + "Tr": "Trust", + "Trnard": "Turnaround", + "Twntyfr": "Twentyfour", + "Ttfr": "Twentyfour", + "US": "United States", + "UCITS": "Ucits", + "Ultr": "Ultra", + "Ulysses LT Funds": "Ulysses - L.T. 
Funds", + "Unbnd": "Unbundled", + "Uncons": "Unconstrained", + "Uncrltd": "Uncorrelated", + "Unhdgd": "Unhedged", + "UnH": "Unhedged", + "Unvsl": "Universal", + "Univ": "University", + "Unrstd": "Unrestricted", + "Unr": "Unrestricted", + "Upstm": "Upstream", + "USD": "USD", + "$": "USD", + "Utilts": "Utilities", + "Util": "Utility", + "Val": "Value", + "Valinvt": "Valueinvest", + "Var": "Variance", + "vhcl": "Vehicle", + "Active": "Vehicle", + "Vol": "Volatility", + "Volatil": "Volatility", + "Vontobel": "Vontobel Fund", + "Vyvazn": "Vyvazene", + "Wtr": "Water", + "Wlth": "Wealth", + "Wpns": "Weapons", + "Wkly": "Weekly", + "Wghd": "Weighed", + "Wtd": "Weighted", + "Wellington II SICAV": "Wellington Management Funds (Luxembourg) II SICAV", + "Wrld": "Wereld", + "Wstn": "Western", + "Wstfld": "Westfield", + "Wholsl": "Wholesale", + "Wnrs": "Winners", + "Wldwd": "Wldwd", + "WW": "Wldwd", + "Wld": "World", + "Yr": "Year", + "Yld": "Yield", + "Y": "Yield", + "Zr": "Zero", + "PLN": "Zloty" +} \ No newline at end of file diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py new file mode 100644 index 0000000..8f316fd --- /dev/null +++ b/core/auz_nz/hybrid_solution_script.py @@ -0,0 +1,717 @@ +import pandas as pd +import os +import json +import json_repair +import pandas as pd +import math +import ast +from .string_similarity import get_cosine_similarity, get_jaccard_similarity, get_levenshtien_distance_score +import nltk +from nltk.corpus import stopwords +# from dotenv import load_dotenv +from collections import Counter +import re +from utils.gpt_utils import chat +from utils.logger import logger + +# gpt_call = GPTAPI() + +# Download the stopwords list if not already downloaded +nltk.download('stopwords') +import json +from openai import AzureOpenAI + +# load_dotenv() +# API_KEY = os.getenv("API_KEY") +# MODEL = os.getenv("MODEL") +# END_POINT = os.getenv("END_POINT") +# API_VERSION = os.getenv("API_VERSION") + + +### STEP 1 - Abbreviation Replacement + +ABB_JSON = dict() + +def get_abb_json(): + global ABB_JSON + with open("abbreviation_records.json", "r") as file: + # Load the JSON and convert keys to lowercase + ABB_JSON = {key.lower(): value for key, value in json.load(file).items()} + +def get_abbre_format_str(fundname): + """Replaces abbreviations in a fund name with their expanded forms.""" + # Convert fund name to lowercase while matching + f_list = fundname.lower().split() + updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list] + return " ".join(updated_doc_fname_words) + +def replace_abbrevs_in_fundnames(fund_names_list): + """Replaces abbreviations in a list of fund names.""" + return [get_abbre_format_str(fund_name) for fund_name in fund_names_list] + + +### STEP 2 - Remove Stopwords + +# Function to clean fund names using NLTK stopwords +def remove_stopwords_nltk(fund_names): + nltk_stopwords = set(stopwords.words('english')) + + # Add custom words if necessary (e.g., fund-related stopwords) + custom_stopwords = {'inc', 'fund', 'lp', 'llc', 'plc'} + final_stopwords = nltk_stopwords.union(custom_stopwords) + + def stopword_clean(fund_name, stopwords): + words = re.split(r'\W+', fund_name.lower()) + filtered_words = [word for word in words if word not in stopwords and word.strip() != ''] + cleaned_fund_name = ' '.join(filtered_words).title() # Return cleaned name in title case + return cleaned_fund_name + + cleaned_fund_names = [stopword_clean(fund, final_stopwords) for fund in fund_names] + + return cleaned_fund_names + 
+### STEP 3 - Special characters removal + +def remove_special_characters(fund_group): + fund_names = [re.sub(r'[^a-zA-Z0-9\s]', ' ', txt_fund).strip() for txt_fund in fund_group] + return fund_names + + +### STEP 4 - Common words removal + +def remove_common_words(fund_list, common_words=None): + if len(fund_list)>2 or common_words: + # Step 1: Tokenize the fund names + tokenized_funds = [fund.split() for fund in fund_list] + + # Step 2: Count the frequency of each word in the fund names + all_words = [word for sublist in tokenized_funds for word in sublist] + word_counts = Counter(all_words) + + if not common_words: + # Step 3: Filter out words that appear in at least 70% of the fund names + threshold = 0.7 * len(fund_list) + common_words = {word for word, count in word_counts.items() if count >= threshold} + common_words = list(common_words) + # Step 4: Remove the common words from each fund name + filtered_funds = [] + for fund in fund_list: + # Split the fund name into words and remove common words + filtered_fund = ' '.join([word for word in fund.split() if word not in common_words]) + + # If removing common words leaves the name empty, retain the original name + if filtered_fund.strip() == '': + filtered_funds.append(fund) + else: + filtered_funds.append(filtered_fund) + else: + filtered_funds = fund_list + return filtered_funds, common_words + + +### STEP 5 - LLM with Provider + +prompt_instruction = """ +### Task Overview: +You will be given data in the form of `provider_name` (string), `prediction_fund` (list of strings), and `true_fund` (list of strings). Your task is to match each fund from the `prediction_fund` list to the correct entry in the `true_fund` list. The final output should be a JSON where the keys are funds from `prediction_fund` and the values are the matching funds from `true_fund` or an empty string `""` if no match is found. + +### Instructions: +1. Provider Name Handling: + If the same word (like the provider name) appears across multiple `true_fund` entries, it is likely part of the provider's name. In this case, ignore such common words while performing the matching. + + Example: + - Input: + `provider_name`: 'Betashares' + `prediction_fund`: + [ + "AUS 200", + "AUS CREDIT", + "AUS SUSTAINABLE", + "GLOBAL QUALITY", + "GLOBAL SUSTAINABLE" + ] + `true_fund`: + [ + "Betashares Australian Sustainability Leaders Fund", + "Betashares Australia 200 Fund", + "Betashares Global Quality Leaders Fund", + "Betashares Australian Investment Grade Corporate Bond Fund", + "Betashares Global Sustainability Leaders Fund" + ] + - Output: + ```json + { + "AUS 200": "Betashares Australia 200 Fund", + "AUS CREDIT": "", + "AUS SUSTAINABLE": "Betashares Australian Sustainability Leaders Fund", + "GLOBAL QUALITY": "Betashares Global Quality Leaders Fund", + "GLOBAL SUSTAINABLE": "Betashares Global Sustainability Leaders Fund" + } + ``` + +2. Abbreviation Handling: + Some `prediction_fund` entries may use abbreviations or short forms (e.g., "AUS" for "Australia"). Identify and handle these cases by using context from both `prediction_fund` and `true_fund` lists, as shown in the example above. Match abbreviations to the expanded terms where applicable. + +3. No Match Cases: + If you cannot find a suitable match for a fund from `prediction_fund` in `true_fund`, leave the match blank by assigning an empty string `""` to that entry. If you are unsure about the correct match, do not make incorrect assumptions — leave it blank. + +4. 
Duplicate Mapping Prevention: + Ensure that each true_fund name maps to only one entry in prediction_fund to avoid duplicate mappings. If multiple prediction_fund names appear to match the same true_fund name, perform a detailed word-by-word analysis to determine the closest match based on content and context. Only map one prediction_fund name to each true_fund name, and if no strong match is found, leave it blank (""). Avoid making assumptions if clarity is lacking. + +### Example Input and Output: + +- Sample 1: + - Input: + `provider_name`: 'ANZ' + `prediction_fund`: + [ + "Cash Fund", + "Conservative Fund", + "Conservative Balanced Fund", + "Balanced Fund" + ] + `true_fund`: + [ + "ANZ KiwiSaver High Growth Fund", + "ANZ KiwiSaver Conservative", + "ANZ KiwiSaver Conservative Balanced", + "ANZ KiwiSaver Balanced Growth", + "ANZ KiwiSaver Growth", + "ANZ KiwiSaver Cash" + ] + - Output: + ```json + { + "Cash Fund": "ANZ KiwiSaver Cash", + "Conservative Fund": "ANZ KiwiSaver Conservative", + "Conservative Balanced Fund": "ANZ KiwiSaver Conservative Balanced", + "Balanced Fund": "" + } + ``` + +- Sample 2: + - Input: + `provider_name`: 'iShare' + `prediction_fund`: + [ + "iShares Wholesale Screened International Equity Index Fund (Class E Units)", + "iShares Wholesale Australian Bond Index Fund (Class E Units)", + "iShares ESG Australian Bond Index Fund (Class E Units)", + "iShares Wholesale Australian Equity Index Fund (Class E Units)", + "iShares Wholesale Australian Listed Property Index Fund (Class E Units)", + "iShares Global Listed Property Index Fund (Hedged Class E Units)", + "iShares Wholesale International Equity Index Fund (Class E Units)", + "iShares Hedged International Equity Index Fund (Class E Units)", + "iShares ESG Global Bond Index Fund (Class E Units)", + "iShares Global Bond Index Fund (Class E Units)" + ] + `true_fund`: + [ + "iShares Wholesale Indexed Australian Bond Fund", + "iShares Global Bond Index Fund", + "iShares Australian Listed Property Index Fund", + "iShares Emerging Markets IMI Equity Index Fund", + "iShares International Equity Index (Hgd)", + "iShares Wholesale Australian Equity Index Fund", + "iShares Screened Wholesale International Equity Index Fund" + ] + - Output: + ```json + { + "iShares Wholesale Screened International Equity Index Fund (Class E Units)": "iShares Screened Wholesale International Equity Index Fund", + "iShares Wholesale Australian Bond Index Fund (Class E Units)": "iShares Wholesale Indexed Australian Bond Fund", + "iShares ESG Australian Bond Index Fund (Class E Units)": "", + "iShares Wholesale Australian Equity Index Fund (Class E Units)": "iShares Wholesale Australian Equity Index Fund", + "iShares Wholesale Australian Listed Property Index Fund (Class E Units)": "iShares Australian Listed Property Index Fund", + "iShares Global Listed Property Index Fund (Hedged Class E Units)": "", + "iShares Wholesale International Equity Index Fund (Class E Units)": "", + "iShares Hedged International Equity Index Fund (Class E Units)": "iShares International Equity Index (Hgd)", + "iShares ESG Global Bond Index Fund (Class E Units)": "", + "iShares Global Bond Index Fund (Class E Units)": "iShares Global Bond Index Fund" + } + ``` + +- Sample 3: + - Input: + `provider_name`: 'Coolabah Capital Investments' + `prediction_fund`: + [ + "Coolabah Short Term Income PIE Fund", + "Coolabah Long-Short Credit PIE Fund" + ] + `true_fund`: + [ + "Coolabah Long-Short Credit PIE Fund", + "Coolabah Short Term Income PIE Fund" + ] + - Output: + ```json + 
{ + "Coolabah Short Term Income PIE Fund": "Coolabah Short Term Income PIE Fund", + "Coolabah Long-Short Credit PIE Fund": "Coolabah Long-Short Credit PIE Fund" + } + ``` + +Context: + +""" + +system_prompt = "You are helpful AI Data Analyst which helps to identify the data to get the information correctly. Read instruction carefully and provide the information accordingly into json format only." + +parameters = { + "temperature": 0, + "max_tokens": 1000, + } + + +### Similarity methods + +cosine_threshold = 0.9 +levenshtien_threshold = 0.98 +jaccard_thresold = 0.95 + +def get_cosine_score(fund_list, pred_fund_name): + matched_result = {} + matched_index = 0 + for fund_db_name in fund_list: + score = get_cosine_similarity(pred_fund_name, fund_db_name) + matched_result.update({fund_db_name:score}) + if len(matched_result)>0: + max_key = max(matched_result, key=matched_result.get) + matched_index = list(matched_result.keys()).index(max_key) + matched_result = {max_key: matched_result[max_key]} + return matched_result, matched_index + +def get_jaccard_score(fund_list, pred_fund_name): + matched_result = {} + matched_index = 0 + for fund_db_name in fund_list: + score = get_jaccard_similarity(pred_fund_name, fund_db_name) + matched_result.update({fund_db_name:score}) + if len(matched_result)>0: + max_key = max(matched_result, key=matched_result.get) + matched_index = list(matched_result.keys()).index(max_key) + matched_result = {max_key: matched_result[max_key]} + return matched_result, matched_index + +def get_levenshtien_score(fund_list, pred_fund_name): + matched_result = {} + matched_index = 0 + for fund_db_name in fund_list: + score = get_levenshtien_distance_score(pred_fund_name, fund_db_name) + matched_result.update({fund_db_name:score}) + if len(matched_result)>0: + max_key = max(matched_result, key=matched_result.get) + matched_index = list(matched_result.keys()).index(max_key) + matched_result = {max_key: matched_result[max_key]} + return matched_result, matched_index + +def get_fund_match_final_score(fund_list, pred_fund_name): + cosine_score_ = "" + jaccard_score_ = "" + levenstein_score_ = "" + + cosine_value_name_ = "" + jaccard_value_name_ = "" + levenstein_value_name_ = "" + # print("-> get_fund_match_final_score: ", fund_list, pred_fund_name) + + # Get scores and matched indices for each similarity metric + cosine_fund_score, cosine_matched_index = get_cosine_score(fund_list, pred_fund_name) + # print("cosine_fund_score, cosine_matched_index: ", cosine_fund_score, cosine_matched_index) + jaccard_fund_score, jaccard_matched_index = get_jaccard_score(fund_list, pred_fund_name) + # print("jaccard_fund_score, jaccard_matched_index: ", jaccard_fund_score, jaccard_matched_index) + levenshtien_fund_score, levenshtein_matched_index = get_levenshtien_score(fund_list, pred_fund_name) + # print("levenshtien_fund_score, levenshtein_matched_index: ", levenshtien_fund_score, levenshtein_matched_index) + + final_result = {} + matched_index = 0 + + # Calculate the cosine score + if cosine_fund_score: + cosine_score_ = list(cosine_fund_score.values())[0] + cosine_value_name_ = list(cosine_fund_score.keys())[0] + if cosine_score_ >= cosine_threshold: + final_result = cosine_fund_score + matched_index = cosine_matched_index + + # Calculate the jaccard score + if jaccard_fund_score: + jaccard_score_ = list(jaccard_fund_score.values())[0] + jaccard_value_name_ = list(jaccard_fund_score.keys())[0] + if jaccard_score_ >= jaccard_thresold and not final_result: + final_result = jaccard_fund_score + 
matched_index = jaccard_matched_index + + # Calculate the levenshtein score + if levenshtien_fund_score: + levenstein_score_ = list(levenshtien_fund_score.values())[0] + levenstein_value_name_ = list(levenshtien_fund_score.keys())[0] + if levenstein_score_ >= levenshtien_threshold and not final_result: + final_result = levenshtien_fund_score + matched_index = levenshtein_matched_index + + # Collect all scores, defaulting to the highest available match if all are equal + all_scores_ = [cosine_score_, jaccard_score_, levenstein_score_] + all_prediction_names_ = [cosine_value_name_, jaccard_value_name_, levenstein_value_name_] + return final_result, matched_index, all_scores_, all_prediction_names_ + + +### Format Response + +def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name, + step0_pred_name=None, step0_db_name=None, + step0_matched_db_name_cosine = None, step0_matched_db_name_jacc = None, step0_matched_db_name_leven = None, + step0_cosine=None, step0_jaccard=None, step0_levenshtein=None, + step1_pred_name=None, step1_db_name=None, + step1_matched_db_name_cosine = None, step1_matched_db_name_jacc = None, step1_matched_db_name_leven = None, + step1_cosine=None, step1_jaccard=None, step1_levenshtein=None, + step2_pred_name=None, step2_db_name=None, + step2_matched_db_name_cosine = None, step2_matched_db_name_jacc = None, step2_matched_db_name_leven = None, + step2_cosine=None, step2_jaccard=None, step2_levenshtein=None, + step3_pred_name=None, step3_db_name=None, + step3_matched_db_name_cosine = None, step3_matched_db_name_jacc = None, step3_matched_db_name_leven = None, + step3_cosine=None, step3_jaccard=None, step3_levenshtein=None, + step4_pred_name=None, step4_db_name=None, + step4_matched_db_name_cosine = None, step4_matched_db_name_jacc = None, step4_matched_db_name_leven = None, + step4_cosine=None, step4_jaccard=None, step4_levenshtein=None, + llm_flag=None,llm_clean_pred_list=None, llm_clean_db_list=None, llm_pred_fund=None, llm_matched_db_name=None, llm_result=None): + dt = { + 'doc_id': doc_id, + 'pred_fund': pred_fund, + 'db_fund': db_fund, + 'cleaned_pred_fund_name': clean_pred_name, + 'cleaned_db_fund_name': clean_db_name, + + 'step0_pred_name': step0_pred_name, + 'step0_db_name': step0_db_name, + 'step0_matched_db_name_cosine': step0_matched_db_name_cosine, + 'step0_matched_db_name_jacc': step0_matched_db_name_jacc, + 'step0_matched_db_name_levenstn': step0_matched_db_name_leven, + 'step0_cosine': step0_cosine, + 'step0_jaccard': step0_jaccard, + 'step0_levenshtein': step0_levenshtein, + + 'step1_pred_name': step1_pred_name, + 'step1_db_name': step1_db_name, + 'step1_matched_db_name_cosine': step1_matched_db_name_cosine, + 'step1_matched_db_name_jacc': step1_matched_db_name_jacc, + 'step1_matched_db_name_levenstn': step1_matched_db_name_leven, + 'step1_cosine': step1_cosine, + 'step1_jaccard': step1_jaccard, + 'step1_levenshtein': step1_levenshtein, + + 'step2_pred_name': step2_pred_name, + 'step2_db_name': step2_db_name, + 'step2_matched_db_name_cosine': step2_matched_db_name_cosine, + 'step2_matched_db_name_jacc': step2_matched_db_name_jacc, + 'step2_matched_db_name_levenstn': step2_matched_db_name_leven, + 'step2_cosine': step2_cosine, + 'step2_jaccard': step2_jaccard, + 'step2_levenshtein': step2_levenshtein, + + 'step3_pred_name': step3_pred_name, + 'step3_db_name': step3_db_name, + 'step3_matched_db_name_cosine': step3_matched_db_name_cosine, + 'step3_matched_db_name_jacc': step3_matched_db_name_jacc, + 'step3_matched_db_name_levenstn': 
step3_matched_db_name_leven, + 'step3_cosine': step3_cosine, + 'step3_jaccard': step3_jaccard, + 'step3_levenshtein': step3_levenshtein, + + 'step4_pred_name': step4_pred_name, + 'step4_db_name': step4_db_name, + 'step4_matched_db_name_cosine': step4_matched_db_name_cosine, + 'step4_matched_db_name_jacc': step4_matched_db_name_jacc, + 'step4_matched_db_name_levenstn': step4_matched_db_name_leven, + 'step4_cosine': step4_cosine, + 'step4_jaccard': step4_jaccard, + 'step4_levenshtein': step4_levenshtein, + + 'llm_flag': llm_flag, + 'llm_clean_pred_list': llm_clean_pred_list, + 'llm_clean_db_list': llm_clean_db_list, + 'llm_pred_fund': llm_pred_fund, + 'llm_matched_db_name': llm_matched_db_name, + 'llm_result': llm_result + } + return dt + + +def final_function_to_match(doc_id, pred_list, db_list, provider_name): + final_result = {} + df_data = [] + unmatched_pred_list = pred_list.copy() + unmatched_db_list = db_list.copy() + for index, pred_fund in enumerate(pred_list): + # print("\n -->> pred_fund: ",pred_fund, index) + try: + ### STEP-0 RAW Test + raw_result, matched_index, all_scores_, all_matched_fund_names_ = get_fund_match_final_score(db_list, pred_fund) + # print("RAW STEP: ",raw_result) + if len(raw_result)>0: + final_result.update({pred_list[index]: db_list[matched_index]}) + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], pred_fund, list(raw_result.keys())[0], + step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + llm_flag=False)) + unmatched_db_list.remove(db_list[matched_index]) + unmatched_pred_list.remove(pred_list[index]) + else: + ### STEP-1 Abbreviation replacement + cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0] + cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list) + # print("--> ",cleaned_db_list1, cleaned_pred_name1) + step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1) + # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}") + # print(f"Cleaned Pred Name: {cleaned_pred_name1, cleaned_db_list1}") + # print(f"Matched Index: {matched_index}, All Scores: {all_scores1_}, All Matched Fund Names: {all_matched_fund_names1_}") + + if len(step1_result)>0: + final_result.update({pred_list[index]: db_list[matched_index]}) + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name1, list(step1_result.keys())[0], + step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, + step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], + step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], llm_flag=False)) + unmatched_db_list.remove(db_list[matched_index]) + unmatched_pred_list.remove(pred_list[index]) + else: + ### STEP-2 Remove Stopwords + 
cleaned_pred_name2 = remove_stopwords_nltk([cleaned_pred_name1])[0] + cleaned_db_list2 = remove_stopwords_nltk(cleaned_db_list1) + # print("--> ",cleaned_db_list2, cleaned_pred_name2) + step2_result, matched_index, all_scores2_, all_matched_fund_names2_ = get_fund_match_final_score(cleaned_db_list2, cleaned_pred_name2) + # print(f"\nStep 2 - Remove Stopwords Result: {step2_result}") + # print(f"Cleaned Pred Name: {cleaned_pred_name2, cleaned_db_list2}") + # print(f"Matched Index: {matched_index}, All Scores: {all_scores2_}, All Matched Fund Names: {all_matched_fund_names2_}") + + if len(step2_result)>0: + final_result.update({pred_list[index]: db_list[matched_index]}) + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name2, list(step2_result.keys())[0], + step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, + step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], + step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], + step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, + step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], + step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2],llm_flag=False)) + unmatched_db_list.remove(db_list[matched_index]) + unmatched_pred_list.remove(pred_list[index]) + else: + ### STEP-3 Special Character Removal + cleaned_pred_name3 = remove_special_characters([cleaned_pred_name2])[0] + cleaned_db_list3 = remove_special_characters(cleaned_db_list2) + # print("--> ",cleaned_db_list3, cleaned_pred_name3) + step3_result, matched_index, all_scores3_, all_matched_fund_names3_ = get_fund_match_final_score(cleaned_db_list3, cleaned_pred_name3) + # print(f"\nStep 3 - Special Character Removal Result: {step3_result}") + # print(f"Cleaned Pred Name: {cleaned_pred_name3, cleaned_db_list3}") + # print(f"Matched Index: {matched_index}, All Scores: {all_scores3_}, All Matched Fund Names: {all_matched_fund_names3_}") + + if len(step3_result)>0: + final_result.update({pred_list[index]: db_list[matched_index]}) + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name3, list(step3_result.keys())[0], step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, + step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], + step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], + step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, + 
step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], + step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2], + step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3, + step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2], + step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2],llm_flag=False)) + unmatched_db_list.remove(db_list[matched_index]) + unmatched_pred_list.remove(pred_list[index]) + else: + ### STEP-4 Common Words Removal + cleaned_db_list4, _ = remove_common_words(cleaned_db_list3) + # print("cleaned_db_list4 : ",cleaned_db_list4) + cleaned_pred_list, _ = remove_common_words(pred_list) + cleaned_pred_name4 = cleaned_pred_list[index] + # print("cleaned_pred_name4: ",cleaned_pred_name4) + step4_result, matched_index, all_scores4_, all_matched_fund_names4_ = get_fund_match_final_score(cleaned_db_list4, cleaned_pred_name4) + # print(f"\nStep 4 - Common Words Removal Result: {step4_result}") + # print(f"Cleaned Pred Name: {cleaned_pred_name4, cleaned_db_list4}") + # print(f"Matched Index: {matched_index}, All Scores: {all_scores4_}, All Matched Fund Names: {all_matched_fund_names4_}") + + if len(step4_result)>0: + final_result.update({pred_list[index]: db_list[matched_index]}) + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4, + list(step4_result.keys())[0], + step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, + step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], + step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], + step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, + step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], + step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2], + step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3, + step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2], + step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2], + step4_pred_name=cleaned_pred_name4, step4_db_name=cleaned_db_list4, + step4_matched_db_name_cosine= all_matched_fund_names4_[0], step4_matched_db_name_jacc= all_matched_fund_names4_[1], step4_matched_db_name_leven= all_matched_fund_names4_[2], + step4_cosine=all_scores4_[0], step4_jaccard=all_scores4_[1], step4_levenshtein=all_scores4_[2], + llm_flag=False)) + # print("unmatched_db_list: ",unmatched_db_list) + # print("unmatched_pred_list: ",unmatched_pred_list) + # print("db_list[matched_index]: ",db_list[matched_index]) 
+ # print("pred_list[index]: ",pred_list[index]) + unmatched_db_list.remove(db_list[matched_index]) + unmatched_pred_list.remove(pred_list[index]) + else: + df_data.append(format_response(doc_id, pred_list[index], db_list[matched_index], cleaned_pred_name4, + db_list[matched_index], + step0_pred_name=pred_fund, step0_db_name=db_list, + step0_matched_db_name_cosine= all_matched_fund_names_[0], step0_matched_db_name_jacc= all_matched_fund_names_[1], step0_matched_db_name_leven= all_matched_fund_names_[2], + step0_cosine=all_scores_[0], step0_jaccard=all_scores_[1], step0_levenshtein=all_scores_[2], + + step1_pred_name=cleaned_pred_name1, step1_db_name=cleaned_db_list1, + step1_matched_db_name_cosine= all_matched_fund_names1_[0], step1_matched_db_name_jacc= all_matched_fund_names1_[1], step1_matched_db_name_leven= all_matched_fund_names1_[2], + step1_cosine=all_scores1_[0], step1_jaccard=all_scores1_[1], step1_levenshtein=all_scores1_[2], + + step2_pred_name=cleaned_pred_name2, step2_db_name=cleaned_db_list2, + step2_matched_db_name_cosine= all_matched_fund_names2_[0], step2_matched_db_name_jacc= all_matched_fund_names2_[1], step2_matched_db_name_leven= all_matched_fund_names2_[2], + step2_cosine=all_scores2_[0], step2_jaccard=all_scores2_[1], step2_levenshtein=all_scores2_[2], + + step3_pred_name=cleaned_pred_name3, step3_db_name=cleaned_db_list3, + step3_matched_db_name_cosine= all_matched_fund_names3_[0], step3_matched_db_name_jacc= all_matched_fund_names3_[1], step3_matched_db_name_leven= all_matched_fund_names3_[2], + step3_cosine=all_scores3_[0], step3_jaccard=all_scores3_[1], step3_levenshtein=all_scores3_[2], + + step4_pred_name=cleaned_pred_name4, step4_db_name=cleaned_db_list4, + step4_matched_db_name_cosine= all_matched_fund_names4_[0], step4_matched_db_name_jacc= all_matched_fund_names4_[1], step4_matched_db_name_leven= all_matched_fund_names4_[2], + step4_cosine=all_scores4_[0], step4_jaccard=all_scores4_[1], step4_levenshtein=all_scores4_[2], + llm_flag=True)) + except Exception as e: + print("Error: ",e) + # print("==>>> DB LIST: ",unmatched_db_list) + # print("==>>> PRED LIST: ",unmatched_pred_list) + if len(unmatched_pred_list)!=0: + cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list) + cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list) + cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list) + + cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list) + cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list) + cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list) + prompt_context = f""" + {prompt_instruction} + + provider_name: {provider_name} + + prediction_fund: + {cleaned_unmatched_pred_list} + + true_fund: + {cleaned_unmatched_db_list} + """ + # print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list) + # print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list) + # llm_response = get_llm_response(prompt_context) + llm_response, with_error = chat( + prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"} + ) + # logger.info(f"fund matching LLM Response: {llm_response}") + if 'response' in llm_response.keys(): + try: + llm_result = json.loads(llm_response['response']) + except: + try: + llm_result = json_repair.loads(llm_response['response']) + except: + llm_result = {} + # try: + # llm_result = ast.literal_eval(llm_response['response'].replace('\n','')) + # except 
+ for k, v in llm_result.items(): + og_db_index = -1 + og_pred_index = -1 + if k in cleaned_unmatched_pred_list: + og_pred_index = cleaned_unmatched_pred_list.index(k) + + if og_pred_index == -1: + # sometimes the raw name and db name come back reversed from the LLM + if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list: + og_pred_index = cleaned_unmatched_pred_list.index(v) + og_db_index = cleaned_unmatched_db_list.index(k) + # swap k and v back into prediction/db order + k, v = v, k + if og_pred_index == -1: + continue + if og_db_index == -1 and v in cleaned_unmatched_db_list: + og_db_index = cleaned_unmatched_db_list.index(v) + + for i in df_data: + if i['pred_fund'] == unmatched_pred_list[og_pred_index]: + if og_db_index != -1: + i['db_fund'] = unmatched_db_list[og_db_index] + i['cleaned_db_fund_name'] = v + final_result.update({unmatched_pred_list[og_pred_index]: unmatched_db_list[og_db_index]}) + else: + i['db_fund'] = '' + i['cleaned_db_fund_name'] = '' + final_result.update({unmatched_pred_list[og_pred_index]: ""}) + i['llm_clean_pred_list'] = cleaned_unmatched_pred_list + i['llm_clean_db_list'] = cleaned_unmatched_db_list + i['llm_pred_fund'] = k + i['llm_matched_db_name'] = v + i['llm_result'] = llm_result + break + + return final_result + +def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names): + result = api_response['data'] + doc_fund_names = [item['fund_name'] for item in result] + db_fund_names = all_investment_db_names.split(';') + for item in result: + item['result']['matched_db_fund_name'] = '' + item['result']['doc_fund_name'] = item['fund_name'] + item['result']['fund_name_matched'] = 'False' + if len(doc_fund_names) > 0 and len(db_fund_names) > 0: + fund_match_result = final_function_to_match(doc_id, doc_fund_names, db_fund_names, providerName) + print("fund_match results: ", fund_match_result) + for k, v in fund_match_result.items(): + if v: + for item in result: + if k == item['fund_name']: + item['fund_name'] = v + item['result']['matched_db_fund_name'] = v + item['result']['doc_fund_name'] = k + item['result']['fund_name_matched'] = 'True' + + api_response['data'] = result + return api_response + + +# pred_list = ['Bond Fund', 'California Tax Free Income Fund', 'John Hancock Bond Fund', 'John Hancock California Tax Free Income Fund', 'John Hancock California Municipal Bond Fund', 'John Hancock Esg Core Bond Fund', 'John Hancock Government Income Fund', 'John Hancock High Yield Fund', 'John Hancock High Yield Municipal Bond Fund', 'John Hancock Income Fund', 'John Hancock Investment Grade Bond Fund', 'John Hancock Municipal Opportunities Fund', 'John Hancock Sovereign Bond Fund', 'John Hancock Short Duration Bond Fund', 'John Hancock Short Duration Municipal Opportunities Fund'] + +# db_list = ['JHancock Bond Fund', 'JHancock CA Municipal Bond Fund', 'JHancock ESG Core Bond Fund', 'JHancock Government Income Fund', 'JHancock High Yield Fund', 'JHancock High Yield Municipal Bond Fund', 'JHancock Income Fund', 'JHancock Investment Grade Bond Fund', 'JHancock Municipal Opportunities Fund', 'JHancock Short Dur Muncpl Opps Fd', 'JHancock Short Duration Bond Fund'] + +# provider_name = "John Hancock" +# doc = 123 + +# result = final_function_to_match(doc, pred_list, db_list, provider_name)
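+# result maps each name in pred_list to its matched db_list name, or "" when no match is found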
+ +# print("\nresult: ",result) \ No newline at end of file diff --git a/core/auz_nz/readme.md b/core/auz_nz/readme.md new file mode 100644 index 0000000..d99c1e3 --- /dev/null +++ b/core/auz_nz/readme.md @@ -0,0 +1,9 @@ +# Hybrid Solution Files Description + +### These are the three files used to perform the fund matching task. + +- `abbreviation_records.json`: JSON file mapping fund-name abbreviations to their expansions. + +- `string_similarity.py`: String-matching logic based on Cosine, Levenshtein, and Jaccard similarity. + +- `hybrid_solution_script.py`: Pre-processing code and the hybrid solution model; `final_function_to_match` is the function to call to get the fund mapping. diff --git a/core/auz_nz/string_similarity.py b/core/auz_nz/string_similarity.py new file mode 100644 index 0000000..65203bb --- /dev/null +++ b/core/auz_nz/string_similarity.py @@ -0,0 +1,77 @@ +import math +import re +from collections import Counter + +from fuzzywuzzy import fuzz + + +WORD = re.compile(r"\w+") + + +def text_to_vector(text): + words = WORD.findall(text) + return Counter(words) + +def get_cosine_similarity(str1: str, str2: str): + """ + Calculate the cosine similarity between two strings. + """ + try: + vec1 = text_to_vector(str1.lower()) + vec2 = text_to_vector(str2.lower()) + intersection = set(vec1.keys()) & set(vec2.keys()) + numerator = sum(vec1[x] * vec2[x] for x in intersection) + sum1 = sum(vec1[x] ** 2 for x in vec1.keys()) + sum2 = sum(vec2[x] ** 2 for x in vec2.keys()) + denominator = math.sqrt(sum1) * math.sqrt(sum2) + if not denominator: + return 0.0 + else: + return float(numerator) / denominator + except Exception: + return 0.0 + +def get_ngrams(text, n): + # Remove spaces and build the set of n-grams + text = text.replace(" ", "") + return {text[i : i + n] for i in range(len(text) - n + 1)} + + +def get_jaccard_similarity(str1: str, str2: str) -> float: + """ + Calculate the Jaccard similarity between two strings. + """ + try: + # Generate bigrams for each string + str1 = str1.lower() + str2 = str2.lower() + set1 = get_ngrams(str1, 2) # Bigrams for str1 + set2 = get_ngrams(str2, 2) # Bigrams for str2 + # Calculate intersection and union + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + # Compute Jaccard similarity + return intersection / union if union != 0 else 0.0 + except Exception: + return 0.0 + +def get_levenshtien_distance_score(str1: str, str2: str) -> float: + """ + Calculate the Levenshtein distance score between two strings.
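+ Uses fuzzywuzzy's fuzz.ratio, which returns an integer in [0, 100], scaled here to [0.0, 1.0].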
+ """ + try: + str1 = str1.lower() + str2 = str2.lower() + similarity_score = fuzz.ratio(str1, str2) + try: + return similarity_score / 100 + except ZeroDivisionError as e: + return 0.0 + except Exception as e: + # print("error: ",e) + pass + return 0.0 \ No newline at end of file diff --git a/core/data_extraction.py b/core/data_extraction.py index 9fe3b46..f607f95 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -495,9 +495,10 @@ class DataExtraction: exclude_data, extract_way="text" ) - response, with_error = chat( - instructions, response_format={"type": "json_object"} + result, with_error = chat( + prompt=instructions, response_format={"type": "json_object"} ) + response = result.get("response", "") if with_error: logger.error(f"Error in extracting tables from page") data_dict = {"doc_id": self.doc_id} @@ -508,6 +509,9 @@ class DataExtraction: data_dict["raw_answer"] = response data_dict["extract_data"] = {"data": []} data_dict["extract_way"] = original_way + data_dict["prompt_token"] = result.get("prompt_token", 0) + data_dict["completion_token"] = result.get("completion_token", 0) + data_dict["total_token"] = result.get("total_token", 0) return data_dict try: data = json.loads(response) @@ -539,6 +543,9 @@ class DataExtraction: data_dict["raw_answer"] = response data_dict["extract_data"] = data data_dict["extract_way"] = original_way + data_dict["prompt_token"] = result.get("prompt_token", 0) + data_dict["completion_token"] = result.get("completion_token", 0) + data_dict["total_token"] = result.get("total_token", 0) return data_dict def extract_data_by_page_image( @@ -566,6 +573,9 @@ class DataExtraction: data_dict["raw_answer"] = "" data_dict["extract_data"] = {"data": []} data_dict["extract_way"] = "image" + data_dict["prompt_token"] = 0 + data_dict["completion_token"] = 0 + data_dict["total_token"] = 0 return data_dict else: if previous_page_last_fund is not None and len(previous_page_last_fund) > 0: @@ -610,9 +620,10 @@ class DataExtraction: exclude_data=exclude_data, extract_way="image" ) - response, with_error = chat( - instructions, response_format={"type": "json_object"}, image_base64=image_base64 + result, with_error = chat( + prompt=instructions, response_format={"type": "json_object"}, image_base64=image_base64 ) + response = result.get("response", "") if with_error: logger.error(f"Error in extracting tables from page") data_dict = {"doc_id": self.doc_id} @@ -623,6 +634,9 @@ class DataExtraction: data_dict["raw_answer"] = response data_dict["extract_data"] = {"data": []} data_dict["extract_way"] = "image" + data_dict["prompt_token"] = result.get("prompt_token", 0) + data_dict["completion_token"] = result.get("completion_token", 0) + data_dict["total_token"] = result.get("total_token", 0) return data_dict try: data = json.loads(response) @@ -644,15 +658,19 @@ class DataExtraction: data_dict["raw_answer"] = response data_dict["extract_data"] = data data_dict["extract_way"] = "image" + data_dict["prompt_token"] = result.get("prompt_token", 0) + data_dict["completion_token"] = result.get("completion_token", 0) + data_dict["total_token"] = result.get("total_token", 0) return data_dict def get_image_text(self, page_num: int) -> str: image_base64 = self.get_pdf_image_base64(page_num) instructions = self.instructions_config.get("get_image_text", "\n") logger.info(f"Get text from image of page {page_num}") - response, with_error = chat( - instructions, response_format={"type": "json_object"}, image_base64=image_base64 + result, with_error = chat( + 
prompt=instructions, response_format={"type": "json_object"}, image_base64=image_base64 ) + response = result.get("response", "") text = "" if with_error: logger.error(f"Can't get text from current image") diff --git a/core/data_mapping.py b/core/data_mapping.py index f76bf54..4218b5f 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -7,6 +7,7 @@ from utils.sql_query_util import ( query_investment_by_provider, ) from utils.logger import logger +from core.auz_nz.hybrid_solution_script import final_function_to_match class DataMapping: @@ -51,6 +52,7 @@ class DataMapping: self.doc_share_name_list = [] self.doc_fund_mapping = pd.DataFrame() self.doc_fund_class_mapping = pd.DataFrame() + self.provider_name = "" else: self.doc_fund_name_list = ( self.document_mapping_info_df["FundName"].unique().tolist() @@ -64,6 +66,7 @@ class DataMapping: self.doc_fund_class_mapping = self.document_mapping_info_df[ ["FundId", "SecId", "ShareClassName", "CurrencyId"] ].drop_duplicates() + self.provider_name = self.document_mapping_info_df["ProviderName"].values[0] logger.info("Setting provider mapping data") self.provider_mapping_df = self.get_provider_mapping() @@ -99,7 +102,129 @@ class DataMapping: provider_mapping_df = provider_mapping_df.drop_duplicates() provider_mapping_df.reset_index(drop=True, inplace=True) return provider_mapping_df - + + def mapping_raw_data_entrance(self): + if self.doc_source == "emear_ar": + return self.mapping_raw_data() + elif self.doc_source == "aus_prospectus": + return self.mapping_raw_data_aus() + else: + return self.mapping_raw_data() + + def mapping_raw_data_aus(self): + logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}") + mapped_data_list = [] + # Generate raw name based on fund name and share name by integrate_share_name + fund_raw_name_list = [] + share_raw_name_list = [] + for page_data in self.raw_document_data_list: + doc_id = page_data.get("doc_id", "") + page_index = page_data.get("page_index", "") + raw_data_list = page_data.get("extract_data", {}).get("data", []) + for raw_data in raw_data_list: + raw_fund_name = raw_data.get("fund_name", "") + if raw_fund_name is None or len(raw_fund_name) == 0: + continue + raw_share_name = raw_data.get("share_name", "") + raw_data_keys = list(raw_data.keys()) + if len(raw_share_name) > 0: + integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name) + if integrated_share_name not in share_raw_name_list: + share_raw_name_list.append(integrated_share_name) + for datapoint in self.datapoints: + if datapoint in raw_data_keys: + mapped_data = { + "doc_id": doc_id, + "page_index": page_index, + "raw_fund_name": raw_fund_name, + "raw_share_name": raw_share_name, + "raw_name": integrated_share_name, + "datapoint": datapoint, + "value": raw_data[datapoint], + "investment_type": 1, + "investment_id": "", + "investment_name": "", + "similarity": 0 + } + mapped_data_list.append(mapped_data) + else: + if raw_fund_name not in fund_raw_name_list: + fund_raw_name_list.append(raw_fund_name) + for datapoint in self.datapoints: + if datapoint in raw_data_keys: + mapped_data = { + "doc_id": doc_id, + "page_index": page_index, + "raw_fund_name": raw_fund_name, + "raw_share_name": "", + "raw_name": raw_fund_name, + "datapoint": datapoint, + "value": raw_data[datapoint], + "investment_type": 33, + "investment_id": "", + "investment_name": "" + } + mapped_data_list.append(mapped_data) + # Mapping raw data with database + iter_count = 30 + fund_match_result = {} + if len(fund_raw_name_list) > 
0: + fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count) + logger.info(f"Fund match result: \n{fund_match_result}") + share_match_result = {} + if len(share_raw_name_list) > 0: + share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count) + logger.info(f"Share match result: \n{share_match_result}") + + for mapped_data in mapped_data_list: + investment_type = mapped_data["investment_type"] + raw_name = mapped_data["raw_name"] + if investment_type == 33: + if fund_match_result.get(raw_name) is not None: + matched_db_fund_name = fund_match_result[raw_name] + if matched_db_fund_name is not None and len(matched_db_fund_name) > 0: + # get FundId from self.doc_fund_mapping + find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name] + if find_fund_df is not None and len(find_fund_df) > 0: + fund_id = find_fund_df["FundId"].values[0] + mapped_data["investment_id"] = fund_id + mapped_data["investment_name"] = matched_db_fund_name + mapped_data["similarity"] = 1 + if investment_type == 1: + if share_match_result.get(raw_name) is not None: + matched_db_share_name = share_match_result[raw_name] + if matched_db_share_name is not None and len(matched_db_share_name) > 0: + # get SecId from self.doc_fund_class_mapping + find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name] + if find_share_df is not None and len(find_share_df) > 0: + share_id = find_share_df["SecId"].values[0] + mapped_data["investment_id"] = share_id + mapped_data["investment_name"] = matched_db_share_name + mapped_data["similarity"] = 1 + + self.output_mapping_file(mapped_data_list) + return mapped_data_list + + def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30): + # Split raw_name_list into parts of iter_count elements each; + # splitting keeps each request below ChatGPT's token limit + raw_name_list_parts = [raw_name_list[i:i + iter_count] + for i in range(0, len(raw_name_list), iter_count)] + all_match_result = {} + for raw_name_part in raw_name_list_parts: + if investment_type == "fund": + match_result = final_function_to_match(doc_id=self.doc_id, + pred_list=raw_name_part, + db_list=self.doc_fund_name_list, + provider_name=self.provider_name) + else: + match_result = final_function_to_match(doc_id=self.doc_id, + pred_list=raw_name_part, + db_list=self.doc_share_name_list, + provider_name=self.provider_name) + all_match_result.update(match_result) + return all_match_result
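+ # Example: 65 raw names with iter_count=30 are matched in batches of 30, 30 and 5, and each batch result is merged into all_match_result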
 def mapping_raw_data(self): """ doc_id, page_index, datapoint, value, @@ -218,6 +343,10 @@ } mapped_data_list.append(mapped_data) + self.output_mapping_file(mapped_data_list) + return mapped_data_list + + def output_mapping_file(self, mapped_data_list: list): json_data_file = os.path.join( self.output_data_json_folder, f"{self.doc_id}.json" ) @@ -239,8 +368,6 @@ extract_data_df.to_excel(writer, sheet_name="extract_data", index=False) except Exception as e: logger.error(f"Failed to save excel file: {e}") - - return mapped_data_list def integrate_share_name(self, raw_fund_name: str, raw_share_name: str): raw_name = "" diff --git a/core/data_translate.py b/core/data_translate.py index fb3f744..b3fca97 100644 --- a/core/data_translate.py +++ b/core/data_translate.py @@ -51,9 +51,10 @@ class Translate_PDF: instructions = f"Context: \n{text}\n\nInstructions: Translate the context into {self.target_language}. \n" instructions += "Please output the translated text in the following JSON format: {\"translated_text\": \"translated text\"} \n\n" instructions += "Answer: \n" - response, with_error = chat( - instructions, response_format={"type": "json_object"} + result, with_error = chat( + prompt=instructions, response_format={"type": "json_object"} ) + response = result.get("response", "") try: data = json.loads(response) except: diff --git a/main.py b/main.py index c30af06..b07ef3f 100644 --- a/main.py +++ b/main.py @@ -16,6 +16,7 @@ from utils.biz_utils import add_slash_to_text_as_regex from core.page_filter import FilterPages from core.data_extraction import DataExtraction from core.data_mapping import DataMapping +from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call from core.metrics import Metrics @@ -277,8 +278,9 @@ class EMEA_AR_Parsing: data_from_gpt, self.document_mapping_info_df, self.output_mapping_data_folder, + self.doc_source ) - return data_mapping.mapping_raw_data() + return data_mapping.mapping_raw_data_entrance() def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> None: @@ -402,6 +404,7 @@ def batch_start_job( pdf_folder: str = "/data/emea_ar/pdf/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", doc_data_excel_file: str = None, + document_mapping_file: str = None, output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", @@ -498,6 +501,16 @@ def batch_start_job( result_extract_data_df.to_excel( writer, index=False, sheet_name="extract_data" ) + + if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file): + try: + merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/") + os.makedirs(merged_total_data_folder, exist_ok=True) + data_file_base_name = os.path.basename(output_file) + output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) + merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path) + except Exception as e: + logger.error(f"Error: {e}") if calculate_metrics: prediction_sheet_name = "total_mapping_data" @@ -989,6 +1002,7 @@ def batch_run_documents( doc_source: str = "emea_ar", special_doc_id_list: list = None, pdf_folder: str = r"/data/emea_ar/pdf/", + document_mapping_file: str = None, output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", @@ -1001,8 +1015,8 @@ page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True calculate_metrics = False @@ -1027,6 +1041,7 @@ pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, + document_mapping_file, output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, @@ -1046,6 +1061,7 @@ pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, + document_mapping_file,
output_extract_data_child_folder, output_mapping_child_folder, output_extract_data_total_folder, @@ -1178,7 +1194,7 @@ def merge_output_data_aus_prospectus( ): # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16 data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data") - document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="Sheet1") + document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping") # set doc_id to be string type data_df["doc_id"] = data_df["doc_id"].astype(str) document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str) @@ -1337,6 +1353,7 @@ if __name__ == "__main__": document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] + document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # special_doc_id_list: list = [ # "539790009", # "542300403", @@ -1350,7 +1367,7 @@ # "555377021", # "555654388", # ] - # special_doc_id_list: list = ["554851189"] + # special_doc_id_list: list = ["534287518"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1366,10 +1383,12 @@ r"/data/aus_prospectus/output/mapping_data/total/" ) drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list, pdf_folder=pdf_folder, + document_mapping_file=document_mapping_file, output_pdf_text_folder=output_pdf_text_folder, output_extract_data_child_folder=output_extract_data_child_folder, output_extract_data_total_folder=output_extract_data_total_folder, diff --git a/requirements.txt b/requirements.txt index f3839f2..df9f377 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,6 @@ pandas==2.2.3 openpyxl==3.1.2 XlsxWriter==3.1.2 tiktoken==0.7.0 -beautifulsoup4==4.12.3 \ No newline at end of file +beautifulsoup4==4.12.3 +fuzzywuzzy==0.18.0 +nltk==3.9.1 \ No newline at end of file diff --git a/utils/gpt_utils.py b/utils/gpt_utils.py index f5aa511..7f51995 100644 --- a/utils/gpt_utils.py +++ b/utils/gpt_utils.py @@ -10,10 +10,6 @@ import dotenv # loads .env file with your OPENAI_API_KEY dotenv.load_dotenv() -# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") -tokenizer = tiktoken.get_encoding("cl100k_base") - - def get_embedding(text, engine=os.getenv("EMBEDDING_ENGINE")): count = 0 error = "" @@ -32,6 +28,7 @@ def get_embedding(text, engine=os.getenv("EMBEDDING_ENGINE")): def num_tokens_from_string(string: str) -> int: """Returns the number of tokens in a text string.""" + tokenizer = tiktoken.get_encoding("cl100k_base") num_tokens = len(tokenizer.encode(string)) return num_tokens @@ -64,6 +61,7 @@ def num_tokens_from_messages(messages, model="gpt-35-turbo-16k"): def chat( prompt: str, + system_prompt: str = None, engine=os.getenv("Engine_GPT4o"), azure_endpoint=os.getenv("OPENAI_API_BASE_GPT4o"), api_key=os.getenv("OPENAI_API_KEY_GPT4o"), @@ -104,11 +102,14 @@ messages = [ ... ] else: messages = [{"role": "user", "content": prompt}] + if system_prompt is not None and len(system_prompt) > 0: + messages.insert(0, {"role": "system", "content": system_prompt}) count = 0 - error = "" + result = {} request_timeout = 600
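+ # Retry the completion request up to 8 times; each failed attempt sleeps 2s before retrying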
while count < 8: + response = None try: if count > 0: print(f"retrying the {count} time...") @@ -139,15 +140,25 @@ response_format=response_format, ) sleep(1) - return response.choices[0].message.content, False + result["full_response"] = response + result["response"] = response.choices[0].message.content + result["prompt_token"] = response.usage.prompt_tokens + result["completion_token"] = response.usage.completion_tokens + result["total_token"] = response.usage.total_tokens + return result, False except Exception as e: error = str(e) print(f"error message: {error}") if "maximum context length" in error: - return error, True + result["full_response"] = response + result["response"] = error + # response is None when the request itself failed, so usage may be unavailable + result["prompt_token"] = response.usage.prompt_tokens if response else 0 + result["completion_token"] = response.usage.completion_tokens if response else 0 + result["total_token"] = response.usage.total_tokens if response else 0 + return result, True count += 1 sleep(2) - return error, True + return result, True def encode_image(image_path: str):
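--
A minimal usage sketch for reviewers (not part of the patch): this mirrors how the updated call sites in core/data_extraction.py and core/data_translate.py consume the dict now returned by chat(); the prompt string here is illustrative only.

from utils.gpt_utils import chat

result, with_error = chat(prompt="Reply with {\"ok\": true} as JSON.", response_format={"type": "json_object"})
response = result.get("response", "")        # model output, or the error text when with_error is True
total_tokens = result.get("total_token", 0)  # token accounting now travels with every call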