diff --git a/app_emea_ar.py b/app_emea_ar.py index 105be43..8aab6d4 100644 --- a/app_emea_ar.py +++ b/app_emea_ar.py @@ -69,7 +69,8 @@ def emea_ar_data_extract(): output_extract_data_folder=output_extract_data_folder, output_mapping_data_folder=output_mapping_data_folder, extract_way=extract_way, - drilldown_folder=drilldown_folder) + drilldown_folder=drilldown_folder, + compare_with_provider=False) doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data) doc_mapping_data = emea_ar_parsing.mapping_data( data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data diff --git a/configuration/emea_ar/abbreviation_records.json b/configuration/emea_ar/abbreviation_records.json new file mode 100644 index 0000000..10302cb --- /dev/null +++ b/configuration/emea_ar/abbreviation_records.json @@ -0,0 +1,1066 @@ +{ + "H": "High", + "(acc)": "(acc)", + "(dist)": "(dist)", + "perf": "(perf)", + "1167 Act Fds ICAV": "1167 Active Funds ICAV", + "Andln": "Aandelen", + "AB": "AB SICAV I", + "Aberdeen Global II": "Aberdeen Global II", + "Aberdeen Liqdty (Lux)": "Aberdeen Liquidity Fund (Lux)", + "Aberdeen Std": "Aberdeen Standard", + "AS SICAV I": "Aberdeen Standard SICAV I", + "ABN AMRO": "ABN AMRO Multi-Manager Funds ", + "AAMMF FoM": "ABN AMRO Multi-Manager Funds - Fund of Mandates", + "Abs": "Absolute", + "AI": "Absolute Insight", + "Acct": "Account", + "Acc": "Accumulation", + "Achrs": "Achievers", + "Actns": "Actions", + "Actv": "Active", + "Act": "Active", + "actvy": "actively", + "Actr": "Actuaries", + "Admt": "Adamant", + "Adj": "Adjustable", + "Adjs": "Adjusted", + "Admin": "Administrative", + "Adm": "Administrative", + "Advcd": "Advanced", + "Advnr": "Advancer", + "Advtg": "Advantage", + "Advts": "Advantus", + "Advnturs": "Adventurous", + "Adv": "Advisor", + "Advs": "Advisors", + "Arspc": "Aerospace", + "Afts": "Affiliates", + "Afr": "Africa", + "Agcy": "Agency", + "Aggt": "Aggregate", + "Agrsv": "Aggressive", + "AgriBsns": "AgriBusiness", + "Agril": "Agricultural", + "Agc": "Agricultural", + "Agltr": "Agriculture", + "Agr": "Agriculture", + "Agri": "Agrinvest", + "Abg": "Airbag", + "Air": "Airlines", + "Akcvy": "Akciovy", + "Akt": "Aktier", + "Albtrs": "Albatross", + "Allnc": "Alliance", + "AllncBrnstn": "AllianceBernstein", + "Allc": "Allocation", + "Allctr": "Allocator", + "Alp": "Alpha", + "Alt": "Alternative", + "Alts": "Alternatives", + "Ambt": "Ambition", + "Amer": "American", + "Am": "American", + "AFS": "Amundi Fund Solutions -", + "Amundi Fds": "Amundi Funds", + "Amundi Fds II": "Amundi Funds II -", + "Amundi IS": "Amundi Index Solutions - Amundi Index", + "Amundi Mny Mkt Fd": "Amundi Money Market Fund", + "Amundi SF": "Amundi S.F. -", + "Angl": "Angel", + "Ann": "Annual", + "ADis": "Annually Distribution", + "AD": "Annually Distribution", + "ATA": "ANTARCTICA", + "AntiBench": "Antibenchmark", + "App": "Appliances", + "Apprec": "Appreciation", + "Appr": "Approche", + "Apl": "April", + "Arbn": "Arabian", + "Arbtrg": "Arbitrage", + "Ar": "Area", + "ARM": "ARMENIA", + "Artfcl": "Artificial", + "AsiaPac": "Asia Pac\u00edfic", + "APAC": "Asia Pac\u00edfic", + "Asn": "Asian", + "Asst": "Asset", + "Ast": "Asset", + "Assts": "Assets", + "Assrd": "Assured", + "Audentia Capital": "Audentia Capital SICAV PLC", + "Ausgwn": "Ausgewogen", + "AUS": "Australian", + "Autcll": "Autocallable", + "Atmt": "Automated", + "Atmtn": "Automation", + "Avant": "Avantage", + "Avrg": "Average", + "Avg": "Average", + "Aviva Investors": "Aviva Investors", + "Awrns": "Awareness", + "AXAIMFIIS": "AXA IM Fixed Income Investment Strategies", + "AXAWF": "AXA World Funds", + "AXAWF II": "AXA World Funds II", + "Bckd": "Backed", + "Bkwrdt": "Backwardated", + "Baillie Gifford WW": "Baillie Gifford Wldwd", + "Bal": "Balanced", + "Bk": "Bank", + "Bkg": "Banking", + "BL": "Banque de Luxembourg", + "Bantleon sel": "Bantleon select", + "Bcly": "Barclays", + "Bsn": "Basin", + "Behvrl": "Behavioral", + "Bench": "Benchmark", + "Bnf": "Benefit", + "Bt": "Beta", + "Bvrg": "Beverage", + "Biotech": "Biotechnology", + "BlkRk": "BlackRock", + "BR": "BlackRock", + "Blnd": "Blended", + "Blmbrg": "Bloomberg", + "BBMSCI": "Bloomberg Barclays MSCI", + "Bl Chp": "Blue Chip", + "BNPP B Strategy": "BNP Paribas B Strategy", + "BNY": "BNY Mellon Liquidity Funds PLC", + "Bd": "Bond", + "Bds": "Bonds", + "Bnsm": "Bonusom", + "Bstr": "Booster", + "Brds": "Brands", + "BrdwnGLB": "BrandywineGLOBAL", + "BRA": "Brazil", + "Bnt": "Brent", + "Bdgw": "Bridgeway", + "Brd": "Broad", + "Bdpst": "Budapest", + "Bldr": "Builder", + "Bnd": "Bundle", + "CaixaBank Global": "CaixaBank Global SICAV", + "CA": "California", + "Cll": "Call", + "Cnd": "Canada", + "CAD": "Canadian Dollar ", + "Candriam Bds": "Candriam Bonds", + "Candriam Eqs B": "Candriam Equities B", + "Candriam Eqs L": "Candriam Equities L", + "Candriam Sustainable": "Candriam Sustainable", + "Cp": "Cap", + "Cptl": "Capital", + "CapitalatWork": "Capitalatwork Foyer Umbrella", + "Cap": "Capitalisation", + "Cpd": "Capped", + "Cps": "Caps", + "Carb": "Carbon", + "Cr": "Core", + "Carmignac Pf": "Carmignac Portfolio", + "Csh": "Cash", + "Cau": "Cautious", + "Cdl": "Cedola", + "Cntrl": "Central", + "Cntrc": "Centric", + "Crtfcts": "Certificates", + "Csky": "Cesky", + "Chg": "Change", + "Chrts": "Charities", + "CHN": "CHINA", + "Chns": "Chinese", + "CNY": "Chinese Yuan ", + "Chp": "Chip", + "Chc": "Choice", + "Cts": "Cities", + "Cl": "Classic", + "Clmt": "Climate", + "Cls": "Close", + "Cld": "Cloud", + "Cogntv": "Cognitive", + "Coll": "Collateralized", + "Cllctn": "Collection", + "Cllctv": "Collective", + "Collect": "Collectivit\u00e9s", + "Col": "COLOMBIA", + "Colord": "Colorado", + "Cmfrt": "Comfort", + "Comrcl": "Commercial", + "Commercial": "Commercialization", + "Cmdts": "Commodities", + "Cmdty": "Commodity", + "Cmd": "Commodity", + "CIF": "Common Investment Fund", + "Comm": "Communication", + "Cie": "Compagnie", + "Cies": "Compagnies", + "Coms": "Companies", + "Com": "Company", + "Cmp": "Compass", + "Cmplt": "Complete", + "Compnt": "Component", + "Comps": "Composite", + "Cmprhsv": "Comprehensive", + "Comp": "Computer", + "Cmptg": "Computing", + "Concntr": "Concentrated", + "Concpt": "Concept", + "Cndtnl": "Conditional", + "Cnsrv": "Conservative", + "Cnsv": "Conservative", + "CsvtCvtb": "Conservative Convertible", + "Convr": "Conserver", + "Cons": "Consolidado", + "Cnstnt": "Constant", + "Constnd": "Constrained", + "Constr": "Constraint", + "Const": "Construction", + "Cnsmr": "Consumer", + "Continen": "Continental", + "Contnn": "Continental", + "Cntgnt": "Contingent", + "Contra": "Contrarian", + "Ctrl": "Control", + "Contrvrsl": "Controversial", + "Cont": "Controversial", + "Cnvrt": "Convertible", + "Convert": "Convertibles", + "Cnvrts": "Convertibles", + "Convex": "Convexit\u00e9", + "Convct": "Conviction", + "Convict": "Convictions", + "Coop": "Cooper", + "Cor": "Core", + "Corp": "Corporates", + "CrpBdIdx": "Corporate Bond Index", + "Countrs": "Countries", + "Cntry": "Country", + "Cpn": "Coupon", + "Cov": "Covered", + "Crct": "Creciente", + "Crdt": "Credit", + "CS": "Credit Suisse", + "CSV SIF": "Credit Suisse Virtuoso SICAV - SIF", + "Cre": "Creek", + "Crsvr": "Crossover", + "Crd": "Crude", + "Cum": "Cumulative", + "Ccis": "Currencies", + "Ccy": "Currency", + "Cust": "Custom", + "Custmzd": "Customized", + "Cycl": "Cyclicals", + "Dl": "Daily", + "Danske FoF": "Danske Fund of Funds", + "Danske Invest Allc": "Danske Invest Allocation SICAV", + "Danske Invest": "Danske Invest SICAV", + "Dt": "Date", + "Dtd": "Dated", + "db": "Db", + "db AM": "db Advisory Multibrands", + "De": "Death", + "Dbt": "Debt", + "Dfnc": "Defence", + "Dfndr": "Defender", + "Defensv": "Defensive", + "Defesv": "Defensive", + "Dfnd": "Defined", + "Dlt": "Delta", + "Delta Lloyd L": "Delta Lloyd L", + "Dmnd": "Demand", + "Dmgrphcs": "Demographics", + "Demgrph": "Demography", + "Dmgr": "Demography", + "Dnmntd": "Denominated", + "Dpsts": "Deposits", + "Dsgntd": "Designated", + "Deutlnd": "Deutschland", + "Dev": "Developed", + "Devpg": "Developing", + "Devpmt": "Development", + "Dgnstcs": "Diagnostics", + "Dgtl": "Digital", + "Dimsnl": "Dimensional", + "Drt": "Direct", + "Dir": "Direct", + "Discplnd": "Disciplined", + "Dscnt": "Discount", + "Discvs": "Discoveries", + "Discv": "Discovery", + "Discret": "Discretion", + "Disctnry": "Discretionary", + "Disc": "Discretionary", + "Disrpt": "Disruptive ", + "Dsrpt": "Disruptive ", + "Dis": "Distribution", + "Divers": "Diversified", + "Div": "Dividend", + "Dvrs": "Diversified", + "Divst": "Diversity", + "DP": "Dividend Payout", + "DR": "Dividend Reinvestment", + "Divs": "Dividends", + "Dlhps": "Dluhopisu", + "Dbyvtl": "Dobyvatelia", + "Dllr": "Dollar", + "Domst": "Domestic", + "DPAM Capital B": "DPAM Capital B", + "Drvn": "Driven", + "Durb": "Durable", + "Drbl": "Durables", + "Dur": "Duration", + "Dyn": "Dynamic", + "Erns": "Earnings", + "Estn": "Eastern", + "Eaton Vance Intl (CYM)": "Eaton Vance International (Cayman Islands)", + "Eaton Vance Intl (IRL)": "Eaton Vance International (Ireland)", + "Ecomm": "Ecommerce ", + "Eco": "Economie", + "Ecos": "Economies", + "Ecoy": "Economy", + "Edu": "Education", + "Elevation Fds (IE)": "Elevation UCITS Funds (Ireland) ICAV", + "E": "Elite", + "Emgnt": "Emergente", + "Em": "Emerging", + "Emerg": "Emerging", + "Em Mkts": "Emerging Markets", + "EM": "Emerging Markets", + "Emply": "Employee", + "Emplmt": "Employment", + "Empwrmt": "Empowerment", + "Endwmnt": "Endowment ", + "Endur": "Endurance", + "Eggm": "Eneagement", + "Engy": "Energy", + "Ey": "Energy", + "Eng": "English", + "Enh": "Enhanced", + "EnhFxIn": "Enhanced Fixed Income", + "Entrprs": "Entrepreneurs", + "Ents": "Enterprises", + "Entrepr": "Entreprendre", + "Entrprnrs": "Entrepreneurs", + "Entr": "Entreprise", + "Envir": "Environment", + "Envirtly": "Environmentally", + "Epch": "Epoch", + "Epsilon Fund": "Epsilon Fund", + "Eqlty": "Equality", + "Eqs": "Equities", + "Eq": "Equity", + "Essential Port Sel": "Essential Portfolio Selection", + "Estblshd": "Established", + "Estt": "Estate", + "Etcl": "Ethical", + "EUR": "EUR", + "\u20ac": "EUR", + "Euro": "Euro", + "EURO": "EURO", + "Eurbl": "Eurobloc", + "Eurodoll": "Eurodollar", + "Eurlnd": "Euroland", + "Eurp": "European", + "Erst": "Eurostoxx", + "Euroz": "Eurozone", + "Ev": "Event", + "Evol": "Evolutif", + "Evvg": "Evolving", + "Excld": "Excluding", + "Excl": "Exclusif", + "Exclsv": "Exclusive", + "Exm": "Exempt", + "Expc": "Expectations", + "Expts": "Expertise", + "Exptrs": "Exporters", + "Exps": "Exps", + "ext": "extend", + "Extnl": "External", + "Extr": "Extra", + "Fac": "Factor", + "Facs": "Factors", + "Flln": "Fallen", + "Fam": "Familiales", + "Fml": "Familie", + "FCP": "Fcp", + "Feb": "February", + "Fedrtd": "Federated", + "Fdr": "Founder", + "Fidelity": "Fidelity Funds", + "Fid": "Fiduciary", + "Finac": "Finance", + "Fincl": "Financial", + "Fincls": "Financials", + "Fi": "Financials", + "Fndr": "Finder", + "Fst": "First", + "Fxd": "Fixed", + "Flx": "Flex", + "Flex": "Flexible", + "Fltng": "Floating", + "Fl": "Floating", + "Flrd": "Floored", + "Foc": "Focus", + "Fcs": "Focused", + "Focused": "Focused SICAV", + "Fstg": "Forstrong", + "Fortn": "Fortnightly", + "FH Aberdeen Global": "Forvaltningshuset Aberdeen Global", + "Fssl Ful": "Fossil Fuel", + "Fndtn": "Foundation", + "Fdrs": "Founders", + "Frmlgtn": "Framlington", + "Frm": "Framlington", + "Fran": "Franchise", + "Frk Flx Er Agt Bd": "Franklin Flexible Euro Aggregate Bond", + "Frntr": "Frontier", + "Frtl": "Frontline ", + "Fd": "Fund", + "FoF": "Fund of Funds", + "Fdmtl": "Fundamental", + "Fdml": "Fundamental", + "Fds": "Funds", + "Fut": "Future", + "Futs": "Futures", + "GAM": "Gam", + "Garant": "Garantizado", + "Gtmr": "Gartmore", + "GBP": "GBP", + "\u00a3": "GBP", + "Gndr": "Gender", + "Gen": "General", + "Grmny": "Germany", + "Gestielle": "Gestielle Investment SICAV", + "Gest": "Gestion", + "Gnts": "Giants", + "Glt": "Gilts", + "Glbl": "Global", + "Glb": "Global", + "GDOF": "Global Dynamic Opportunities Fund Ltd.", + "GEI": "Global Equity Income ", + "GlInGd": "Global Investment Grade", + "Glble": "Globale", + "Glblnch": "Globalnich", + "GS": "Goldman Sachs", + "GSF II": "Goldman Sachs Funds II", + "Goodbody": "Goodbody Platform ICAV", + "Gouvrmntls": "Gouvernementales", + "Govt": "Government", + "Gvs": "Govies", + "Grd": "Grade", + "Grde": "Grande", + "Grter": "Greater", + "Grs Inc": "Gross Income", + "GI": "Gross Income", + "GP": "Gross Paying", + "Grp": "Group", + "Grwr": "Growers", + "Gr": "Growth", + "GSQrtx": "GSQuartix", + "Grtd": "Guaranteed", + "HY": "High Yield", + "Hpshr": "Hampshire", + "Hrd": "Hard", + "Hrdwr": "Hardware", + "Hdstrt": "Headstart", + "Hlth": "Health", + "Hlthcare": "Healthcare", + "Hlthcr": "Healthcare", + "Hdg": "Hedged", + "HF": "Hedged Fund", + "Hndrsn Pn": "Henderson Pan", + "Heptagon": "Heptagon Fund PLC", + "Hereford Fds": "Hereford Funds", + "Hertg": "Heritage", + "Hxvt": "Hexavest", + "Hi": "High", + "Hi Yld": "High Yield", + "Hldg": "Holding", + "HKD": "Hong Kong Dollar ", + "Hrzn": "Horizon", + "Hosptlty": "Hospitality", + "Human": "Humanisme", + "Hngrn": "Hungarian", + "Hyb": "Hybrid", + "Impct": "Impact", + "Imp": "Impact", + "Incld": "Including", + "Incl": "Including", + "Inc": "Income", + "Indep": "Independence", + "Idx": "Index", + "Idxd": "Indexed", + "Idxng": "Indexing", + "INR": "Indian Rupee ", + "Individualnh": "Individualniho", + "IDR": "Indonesian Rupiah ", + "Industr": "Industralized", + "Indstr": "Industrials", + "Inds": "Industries", + "Infl": "Inflation", + "Infor": "Information", + "Info": "Information", + "Infmd": "Informed", + "Infras": "Infrastructure", + "Initl": "Initial", + "Innovt": "Innovation", + "Innvtv": "Innovative", + "Innovtr": "Innovators", + "Insnstv": "Insensitive", + "Insim": "Insieme", + "Insgts": "Insights", + "Inst": "Institution", + "Instl": "Institutional", + "Ins": "Insurance", + "Insts": "Institutions", + "Intllgnc": "Intelligence", + "Intlgc": "Intelligence", + "Intst": "Interest", + "IntrR": "Interest Rate", + "Intmdt": "Intermediate", + "Itmt": "Intermediate", + "Intl": "International", + "Internat": "Internationales", + "Inter": "Interval", + "Intrs": "Intrinsic", + "Invrs": "Inverse", + "Inv": "Investors", + "Investec GSF": "Investec Global Strategy Fund", + "Investec SIV": "Investec Series IV", + "Invt": "Investing", + "Invmt": "Investment", + "Invm": "Investment", + "IG": "Investment Grade", + "Invmts": "Investments", + "Invms": "Investments", + "Irl": "Ireland", + "IEP": "Irish Pound ", + "Issr": "Issuer", + "ITA": "Italy", + "JPM": "JPMorgan Liquidity Funds", + "Jan": "Janvier", + "Jpn": "Japanese", + "JPY": "Japanese Yen ", + "JPM ISF II": "JPMorgan Investment Strategies Funds II", + "JPMF": "JPMorgan Portfolio Strategies Funds", + "Jmpr": "Jumper", + "Jmpstr": "Jumpstart", + "Kairos Alpha SICAV": "Kairos Alpha SICAV", + "Kairos Intl SICAV": "Kairos International SICAV", + "KBC Eq Fd": "KBC Equity Fund", + "KBC Index Fd": "KBC Index Fund", + "KBC Instl Fd": "KBC Institutional Fund ", + "KBC Master Fd": "KBC Master Fund", + "KBC Renta": "KBC Renta", + "KBC Select Immo": "KBC Select Immo", + "KH": "Kleinwort Hambros ", + "Knzvtvn": "Konzervativni", + "Krtkdbh": "Kratkodobych", + "Ldrd": "Laddered", + "Lrg": "Large", + "Lgr": "Larger", + "LA": "Latin America", + "Lattd": "Latitude", + "Ldrs": "Leaders", + "Ldrsp": "Leadership", + "Ldg": "Leading", + "Lrng": "Learning", + "Lvl": "Level", + "Lvrg": "Leverage", + "Lvrgd": "Leveraged", + "LGIP": "LGIP Funds (Lux)", + "Lf": "Life", + "Lfstyl": "Lifestyle", + "Ltd": "Limited", + "Lnkd": "Linked", + "Liqd": "Liquid", + "Liq": "Liquidez", + "Liqdty": "Liquidity", + "Lqdty": "Liquidity", + "Lstd": "Listed", + "Lvstk": "Livestock", + "Lvg": "Living", + "Ln": "Loan", + "Lns": "Loans", + "Lcl": "Local", + "LO Funds III": "Lombard Odier Funds III", + "LO Selection": "Lombard Odier Selection", + "LSE": "London Stock Exchange", + "Lng": "Long", + "Lg": "Long", + "L S": "Long Short", + "L/S": "Long/Short", + "L/T": "Long Term", + "L-S": "Long-Short", + "Lw": "Low", + "LRWgt": "Low Risk Weighted", + "LUX": "Luxembourg", + "Lyn": "Lynch", + "Lyxor Invmt Fds": "Lyxor Investment Funds", + "Lyxor Newcits II Plc": "Lyxor Newcits II Plc", + "Macq": "Macquarie", + "Mac": "Macro", + "Mntn": "Maintain", + "MAS": "Malaysia", + "Mgd": "Managed", + "Mgmt": "Management", + "Mgr": "Manager", + "Mgrs": "Managers", + "Manu": "Manulife", + "Manulife GF": "Manulife Global Fund", + "Mkt": "Market", + "MN": "Market Neutral", + "MNP": "Market Neutral Portfolio", + "Mkts": "Markets", + "Mkwd": "Marketwide", + "Mtr": "Master", + "Mstr": "Masters", + "Machg": "Matching", + "Matrls": "Materials", + "Mat": "Maturity", + "Mxmsr": "Maximiser", + "Med": "Mediterranean", + "Mdm": "Medium", + "M/T": "Medium Term", + "Mgtrnd": "Megatrend", + "Mrg": "Merger", + "Mrl": "Merrill", + "Mtl": "Metal", + "Mtls": "Metals", + "MFS Inv": "MFS\u00ae Investment Funds", + "MFS Meridian": "MFS\u00ae Meridian Funds", + "Md": "Mid", + "Mdl": "Middle", + "Mnrs": "Miners", + "Min": "Minimum", + "Mng": "Mining", + "MnRsk": "MinRisk", + "Mirabaud": "Mirabaud Luxembourg SIF", + "Mitlnd": "Mittelstand", + "Mod": "Moderate", + "Mdfd": "Modified", + "Momt": "Momentum", + "Mny": "Money", + "Mth": "Month", + "Mn": "Monthly", + "Mthly": "Monthly", + "Mly": "Monthly", + "MDis": "Monthly Distribution", + "MD": "Monthly Distribution", + "Mnmnt": "Monument", + "Mt": "Monument", + "Moorea Fd": "Moorea Fund", + "MS INVF": "Morgan Stanley Investment Funds", + "Mortg": "Mortgage", + "Mlt": "Multi", + "Multi Challenge": "Multi Challenge SICAV", + "MltAdv": "Multiadvisers", + "Mltalt": "Multialternative", + "Mltast": "Multiasset", + "Mlt-Asst": "Multi-Asset", + "MA": "Multi-Asset", + "Multicoop": "Multicooperation", + "Mltfct": "Multifactor", + "Mlt-Mgr": "Multi-Manager", + "MltOpps": "Multiopportunities SICAV", + "Multipartner": "Multipartner SICAV", + "RobecoSAM": "Multipartner SICAV - RobecoSAM", + "Mltplr": "Multiplier", + "Mltsct": "Multisector", + "MSMM": "Multi-Style, Multi-Manager SICAV Funds plc", + "MU Lux": "MULTI-UNITS LUXEMBOURG", + "Muncpl": "Municipal", + "Mut": "Mutual", + "Myfd": "MY.fund", + "Ntnl": "National", + "Natrl": "Natural", + "Nat": "Naturelles", + "Nbg Bm": "Neuberger Berman", + "Nflz": "Neuflize", + "Netrl": "Neutral", + "New Capital": "New Capital Fund Lux", + "Nwtn": "Newton", + "NN (B) Invest": "NN (B) Invest", + "NN (L) Intl": "NN (L) International", + "NN (L) Pat": "NN (L) Patrimonial", + "NVt": "Non-Voting", + "Nordea 1 -": "Nordea 1 -", + "Nrm": "Normal", + "Nrth": "North", + "NT": "Northern Trust Ucits Common Contractual Fund", + "Nor": "NORWAY", + "NOK": "Norwegian Krone ", + "Nov": "November", + "Nvych": "Novych", + "O\u2019Sh": "O\u2019Shaughnessy", + "Oblig": "Obligatie", + "Obl": "Obligationer", + "Oct": "Octobre", + "Off": "Offensiv", + "Offsh": "Offshore", + "Op": "Open", + "Oppc": "Opportunistic", + "Opports": "Opportunities", + "Opps": "Opportunities", + "Opp": "Opportunity", + "Optm": "Optimum", + "Optd": "Optimised", + "Optr": "Optimiser", + "Optmzr": "Optimizer", + "Optimum": "Optimum Fund", + "Opt": "Option", + "Ord": "Ordinary", + "Ori": "Orient", + "Oth": "Other", + "Ovrs": "Overseas", + "Ovrwrtg": "Overwriting", + "Ownshp": "Ownership", + "Pac": "Pacific", + "Ps": "Paesi", + "Prmtrc": "Parametric", + "Paty": "Parity", + "Part": "Partenaires", + "Prtly": "Partially", + "PtH": "Partially-Hedged", + "Ptcpt": "Participant", + "Partic": "Participation", + "Ptnr": "Partner", + "Ptnrs": "Partners", + "PGLI": "Partners Group Listed Investments SICAV", + "Partners Group Listed": "Partners Group Listed Investments SICAV - Listed", + "Pasv": "Passive", + "Patrim": "Patrimoine", + "Patriml": "Patrimonial", + "Py": "Pay", + "Pyt": "Payout", + "P2P": "Peer to Peer", + "Pensn": "Pension", + "Pen": "Pension", + "Perf": "Performance", + "Perfm": "Performers", + "Prdic": "Periodic", + "Prd": "Periodo", + "Perpt": "Perpetual", + "Psnl": "Personal", + "Phrm": "Pharma", + "Phrmctls": "Pharmaceuticals", + "PI Inv": "PI Investment Funds", + "PIMCO": "PIMCO", + "PIMCO IRL": "PIMCO Funds Ireland PLC", + "PIMCO GIS": "PIMCO Funds: Global Investors Series plc", + "PIMCO Sel": "PIMCO Select Funds PLC", + "Pinr": "Pioneer", + "Pvvrv": "Pivovarov", + "Pln": "Plan", + "Pltnm": "Platinum", + "Plato IIF": "Plato Institutional Index Fund", + "Plyrs": "Players", + "plc": "plc.", + "Pl": "Pool", + "Polar Cap": "Polar Capital Funds PLC", + "Plcy": "Policy", + "Pld": "Pooled", + "Port": "Portfolio", + "Ptf": "Portfolio", + "Pstv": "Positive", + "Pwr": "Power", + "Prec": "Precious", + "PM": "Precious Metals", + "Prfrnc": "Preference ", + "Pref": "Preferred", + "Pre": "Premia", + "Prem": "Premier", + "Prm": "Premium", + "Presv": "Preservation", + "Prstg": "Prestige", + "Prc": "Price", + "Prcng": "Pricing", + "Prlztst": "Prilezitosti", + "Pr": "Prime", + "Princ": "Principal", + "Principal": "Principal Global Investors Funds", + "Priv": "Private", + "PBFI": "Private Bank Funds I", + "Privl": "Privilege", + "Prcss": "Process", + "Prod": "Products", + "Prfl": "Profile", + "Prog": "Progressif", + "Prgrv": "Progressive", + "Prpty": "Property", + "Protec": "Protecci\u00f3n", + "Protd": "protected", + "Prot": "Protection", + "Prvds": "Providus", + "Prdnt": "Prudente", + "Pru": "Prudential", + "PCFS": "Pure Capital Fund SICAV", + "Pure SIF SA": "Pure SICAV-SIF S.A.", + "PtWrt": "PutWrite", + "PW": "PutWrite", + "Qual": "Quality", + "Qul": "Quality", + "Qntmtl": "Quantamental", + "Quant": "Quantitative", + "Quants": "Quantitatives", + "Qt": "Quarterly", + "QDis": "Quarterly Distribution", + "QD": "Quarterly Distribution", + "Quoniam Fds Sel": "Quoniam Funds Selection SICAV", + "RAMS": "RAMS Investment Unit Trust", + "Rt": "Return", + "Rts": "Reuters", + "Rl": "Real", + "RE": "Real Estate", + "Rl Rt": "Real Return", + "Rms": "Reams", + "Rsnbl": "Reasonable", + "Rebal": "Rebalance", + "Rcvy": "Recovery", + "Red Arc Glb Invms": "Red Arc Global Investments (Ireland) ICAV", + "Rgnl": "Regional", + "Rglr": "Regular", + "Reg": "Regular", + "Relatv": "Relative", + "Rlx": "Relax", + "Rendim": "Rendimiento", + "Rnt": "Renta", + "RF": "Renta Fija", + "Renta": "Rentabilit\u00e9", + "Rsrch": "Research", + "Rsh": "Research", + "Rsrv": "Reserves", + "Res": "Resources", + "responsibility": "responsAbility SICAV (Lux)", + "Rspnb": "Responsible", + "Resrs": "Ressources", + "Restrc": "Restricted", + "Rstrcng": "Restructuring", + "Retl": "Retail", + "Ret": "Return", + "Retrs": "Reuters", + "Rev": "Revenue", + "Rvvl": "Revival", + "Revolt": "Revolution", + "Rsg": "Rising", + "Rsk": "Risk", + "Rd": "Road", + "Rds": "Roads", + "Rbtc": "Robotics", + "Rdn": "Rodina", + "RLBF II": "Royal London Bond Funds II ICVC", + "RCCF": "Russell Common Contractual Fund", + "RIQIC Fund plc": "Russell Investments Qualifying Investor China Fund plc", + "S&P": "S&p", + "Sat": "Satellites", + "Satsftn": "Satisfaction", + "Svg": "Saving", + "Schroder AS": "Schroder Alternative Solutions", + "Schroder GAIA": "Schroder GAIA", + "Schroder GAIA II": "Schroder GAIA II", + "Schroder ISF": "Schroder International Selection Fund", + "Schroder Invmt Fd": "Schroder Investment Fund", + "Schroder Sel": "Schroder Selection", + "Schroder SMBC Glb Bd": "Schroder SMBC Global Bond Series", + "Schroder SSF": "Schroder Special Situations Fund", + "Sci": "Scientific", + "Scintfc": "Scientific", + "Scrd": "Scored", + "Scrn": "Screened", + "Sect": "Sectors", + "Secu": "Secure", + "Secs": "Securities", + "Scs": "Securities", + "Sctsd": "Securitised", + "Sctzd": "Securitized", + "Sec": "Security", + "Sgrgtd": "Segragated", + "SEI GAF": "SEI Global Assets Fund plc - The SEI", + "SEI GIF": "SEI Global Investments Fund Plc - The SEI", + "SEI GMF ": "SEI Global Master Fund plc - The SEI", + "Selec": "Selecci\u00f3n", + "Sel": "Selectis", + "Slctv": "Selective", + "Sdis": "Semi-annual Distribution", + "SD": "Semi-annual Distribution", + "Sr": "Senior", + "Ser": "Series", + "Svc": "Service", + "Svd Plfm": "Serviced Platform SICAV", + "Svcs": "Services", + "Shckltn": "Shackleton", + "SSE": "Shanghai Stock Exchange", + "Shr": "Share", + "Shld": "Shareholder", + "Shrs": "Shares", + "Shrh": "Shariah ", + "ShelteR Invest": "ShelteR Invest", + "Shrt": "Short", + "Short Dur": "Short Duration", + "Shrt Dur": "Short Duration", + "S/T": "Short-Term", + "SICAV": "sicav", + "Smplty": "Simplicity", + "SGD": "Singapore Dollar ", + "Sits": "Situations", + "Sivek": "Sivek", + "Skyline": "Skyline Umbrella Fund ICAV", + "Slv": "Sleeve", + "Sm": "Small", + "S&M": "Small & Mid", + "Sm Cp": "Small Caps", + "SmCp": "SmallCap", + "Sm-Cp": "Small-Cap", + "Smlr": "Smaller", + "Smrt": "Smart", + "SB": "Smart Beta", + "Smrtfd": "Smartfund", + "Smd": "Smid", + "Sclly": "Socially", + "ScllyAwr": "Socially Aware", + "Sftwr": "Software", + "Solid": "Solidaire", + "Sodty": "Solidarity", + "Solu": "Solutions", + "Sostnbl": "Sostenible", + "Souv": "Souverain", + "Sov": "Sovereigns", + "Svrgn": "Soverign", + "Spec": "Special", + "Spctm": "Spectrum", + "Sptlt": "Spotlight", + "Sqr": "Square", + "Stblty": "Stability", + "Stbl": "Stable", + "Std": "Standard", + "SLI": "Standard Life Investments Global SICAV", + "SLI GS II": "Standard Life Investments Global SICAV II", + "Stp": "Staples", + "Strt": "Start", + "Stt Strt": "State Street", + "Statstcl": "Statistical", + "Stpng": "Steepening", + "Stlg": "Sterling", + "Stwdsp": "Stewardship", + "Stk": "Stock", + "Stckpckr": "Stockpicker", + "Stks": "Stocks", + "Strat": "Strategy", + "Strats": "Strategies", + "Stgy": "Strategy", + "Struct": "Structured", + "Strctr": "Structures", + "Sbctnt": "Subcontinent", + "SbFd": "Sub-Fund", + "Sub": "Subsector", + "Skk ": "Sukuk ", + "Spr": "Super", + "ST Plus": "Super Trust Plus", + "Sprntnl": "Supranational", + "Srpls": "Surplus", + "Sustnby": "Sustainability", + "Sstby": "Sustainability", + "Sust": "Sustainable", + "Sst": "Sustainable", + "SEK": "Swedish Krona ", + "Sw": "Sweep", + "Swisscanto (LU) BF": "Swisscanto (LU) Bond Fund", + "Swisscanto (LU) EF": "Swisscanto (LU) Equity Fund", + "Swisscanto (LU) MMF": "Swisscanto (LU) Money Market Fund", + "Swisscanto (LU) PF": "Swisscanto (LU) Portfolio Fund", + "Switz": "Switzerland", + "Symphonia": "Symphonia Lux SICAV", + "Sntgm": "Syntagma", + "Sys": "System", + "Systmtc": "Systematic", + "Sysmc": "Systematic", + "T. Rowe Price": "T. Rowe Price Funds SICAV", + "Tact": "Tactical", + "Tailrd": "Tailored", + "Trgt": "Target", + "Tech": "Technology", + "Techs": "Technologies", + "Tchs": "Technologies", + "Tele": "Telecom", + "Telecms": "Telecommunications", + "Tmpltn": "Templeton", + "Trm": "Termine", + "Thm": "Thomson", + "Thms": "Themes", + "Thmsn": "Thomson", + "Thr Brdg Eurp": "Three Bridges Europe", + "Tilney ICAV": "Tilney Umbrella A ICAV", + "Tmng": "Timing", + "Ttl": "Total", + "TR": "Total Return", + "Trk": "Track", + "Trkr": "Tracker", + "Trdbl": "Tradable", + "Trd": "Trade", + "Trdg": "Trading", + "Trnh": "Tranche", + "Trsctn": "Transaction", + "Trans": "Transamerica", + "Transfmt": "Transformational", + "Trnsfm": "Transformational", + "Trsptn": "Transportation", + "Treas": "Treasuries", + "Trs": "Treasury", + "Trnd": "Trend", + "Trnds": "Trends", + "Trndy": "Trendy", + "Trhv": "Trhov", + "Trl": "Trials", + "Trif": "Triflex", + "Tr": "Trust", + "Trnard": "Turnaround", + "Twntyfr": "Twentyfour", + "Ttfr": "Twentyfour", + "US": "United States", + "UCITS": "Ucits", + "Ultr": "Ultra", + "Ulysses LT Funds": "Ulysses - L.T. Funds", + "Unbnd": "Unbundled", + "Uncons": "Unconstrained", + "Uncrltd": "Uncorrelated", + "Unhdgd": "Unhedged", + "UnH": "Unhedged", + "Unvsl": "Universal", + "Univ": "University", + "Unrstd": "Unrestricted", + "Unr": "Unrestricted", + "Upstm": "Upstream", + "USD": "USD", + "$": "USD", + "Utilts": "Utilities", + "Util": "Utility", + "Val": "Value", + "Valinvt": "Valueinvest", + "Var": "Variance", + "vhcl": "Vehicle", + "Active": "Vehicle", + "Vol": "Volatility", + "Volatil": "Volatility", + "Vontobel": "Vontobel Fund", + "Vyvazn": "Vyvazene", + "Wtr": "Water", + "Wlth": "Wealth", + "Wpns": "Weapons", + "Wkly": "Weekly", + "Wghd": "Weighed", + "Wtd": "Weighted", + "Wellington II SICAV": "Wellington Management Funds (Luxembourg) II SICAV", + "Wrld": "Wereld", + "Wstn": "Western", + "Wstfld": "Westfield", + "Wholsl": "Wholesale", + "Wnrs": "Winners", + "Wldwd": "Wldwd", + "WW": "Wldwd", + "Wld": "World", + "Yr": "Year", + "Yld": "Yield", + "Y": "Yield", + "Zr": "Zero", + "PLN": "Zloty" +} \ No newline at end of file diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py index d97b588..514a027 100644 --- a/core/auz_nz/hybrid_solution_script.py +++ b/core/auz_nz/hybrid_solution_script.py @@ -32,24 +32,24 @@ from openai import AzureOpenAI ABB_JSON = dict() -def get_abb_json(): +def get_abb_json(doc_source: str = "aus_prospectus"): global ABB_JSON if len(ABB_JSON.keys()) == 0: - with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file: + with open(f"./configuration/{doc_source}/abbreviation_records.json", "r") as file: # Load the JSON and convert keys to lowercase ABB_JSON = {key.lower(): value for key, value in json.load(file).items()} -def get_abbre_format_str(fundname): +def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"): """Replaces abbreviations in a fund name with their expanded forms.""" # Convert fund name to lowercase while matching f_list = fundname.lower().split() - get_abb_json() + get_abb_json(doc_source) updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list] return " ".join(updated_doc_fname_words) -def replace_abbrevs_in_fundnames(fund_names_list): +def replace_abbrevs_in_fundnames(fund_names_list, doc_source: str = "aus_prospectus"): """Replaces abbreviations in a list of fund names.""" - return [get_abbre_format_str(fund_name) for fund_name in fund_names_list] + return [get_abbre_format_str(fund_name, doc_source) for fund_name in fund_names_list] ### STEP 2 - Remove Stopwords @@ -440,7 +440,7 @@ def format_response(doc_id, pred_fund, db_fund, clean_pred_name, clean_db_name, return dt -def final_function_to_match(doc_id, pred_list, db_list, provider_name): +def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_source: str = "aus_prospectus"): final_result = {} df_data = [] unmatched_pred_list = pred_list.copy() @@ -466,8 +466,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name): # unmatched_pred_list.remove(pred_list[index]) else: ### STEP-1 Abbreviation replacement - cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund])[0] - cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list) + cleaned_pred_name1 = replace_abbrevs_in_fundnames([pred_fund], doc_source)[0] + cleaned_db_list1 = replace_abbrevs_in_fundnames(db_list, doc_source) # print("--> ",cleaned_db_list1, cleaned_pred_name1) step1_result, matched_index, all_scores1_, all_matched_fund_names1_ = get_fund_match_final_score(cleaned_db_list1, cleaned_pred_name1) # print(f"\nStep 1 - Abbreviation Replacement Result: {step1_result}") @@ -617,11 +617,11 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name): # print("==>>> DB LIST: ",unmatched_db_list) # print("==>>> PRED LIST: ",unmatched_pred_list) if len(unmatched_pred_list)!=0: - cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list) + cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source) cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list) cleaned_unmatched_pred_list = remove_special_characters(cleaned_unmatched_pred_list) - cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list) + cleaned_unmatched_db_list = replace_abbrevs_in_fundnames(unmatched_db_list, doc_source) cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list) cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list) prompt_context = f""" diff --git a/core/data_mapping.py b/core/data_mapping.py index 4218b5f..8578c1c 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -1,6 +1,7 @@ import os import json import pandas as pd +from copy import deepcopy from utils.biz_utils import get_most_similar_name, remove_common_word from utils.sql_query_util import ( query_document_fund_mapping, @@ -18,14 +19,18 @@ class DataMapping: raw_document_data_list: list, document_mapping_info_df: pd.DataFrame, output_data_folder: str, - doc_source: str = "emea_ar" + doc_source: str = "emea_ar", + compare_with_provider: bool = True ): self.doc_id = doc_id self.datapoints = datapoints self.doc_source = doc_source + self.compare_with_provider = compare_with_provider self.raw_document_data_list = raw_document_data_list if document_mapping_info_df is None or len(document_mapping_info_df) == 0: - self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) + self.document_mapping_info_df = query_document_fund_mapping( + doc_id, rerun=False + ) else: self.document_mapping_info_df = document_mapping_info_df @@ -44,7 +49,9 @@ class DataMapping: def set_mapping_data_by_db(self, document_mapping_info_df: pd.DataFrame): logger.info("Setting document mapping data") if document_mapping_info_df is None or len(document_mapping_info_df) == 0: - self.document_mapping_info_df = query_document_fund_mapping(self.doc_id, rerun=False) + self.document_mapping_info_df = query_document_fund_mapping( + self.doc_id, rerun=False + ) else: self.document_mapping_info_df = document_mapping_info_df if len(self.document_mapping_info_df) == 0: @@ -92,26 +99,27 @@ class DataMapping: def get_provider_mapping(self): if len(self.document_mapping_info_df) == 0: return pd.DataFrame() - provider_id_list = ( - self.document_mapping_info_df["ProviderId"].unique().tolist() - ) + provider_id_list = self.document_mapping_info_df["ProviderId"].unique().tolist() provider_mapping_list = [] for provider_id in provider_id_list: - provider_mapping_list.append(query_investment_by_provider(provider_id, rerun=False)) + provider_mapping_list.append( + query_investment_by_provider(provider_id, rerun=False) + ) provider_mapping_df = pd.concat(provider_mapping_list) provider_mapping_df = provider_mapping_df.drop_duplicates() provider_mapping_df.reset_index(drop=True, inplace=True) return provider_mapping_df - + def mapping_raw_data_entrance(self): - if self.doc_source == "emear_ar": + if self.doc_source == "emea_ar": return self.mapping_raw_data() elif self.doc_source == "aus_prospectus": - return self.mapping_raw_data_aus() + return self.mapping_raw_data_generic() else: return self.mapping_raw_data() - - def mapping_raw_data_aus(self): + # return self.mapping_raw_data_generic() + + def mapping_raw_data_generic(self): logger.info(f"Mapping raw data for AUS Prospectus document {self.doc_id}") mapped_data_list = [] # Generate raw name based on fund name and share name by integrate_share_name @@ -128,7 +136,9 @@ class DataMapping: raw_share_name = raw_data.get("share_name", "") raw_data_keys = list(raw_data.keys()) if len(raw_share_name) > 0: - integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name) + integrated_share_name = self.integrate_share_name( + raw_fund_name, raw_share_name + ) if integrated_share_name not in share_raw_name_list: share_raw_name_list.append(integrated_share_name) for datapoint in self.datapoints: @@ -144,7 +154,7 @@ class DataMapping: "investment_type": 1, "investment_id": "", "investment_name": "", - "similarity": 0 + "similarity": 0, } mapped_data_list.append(mapped_data) else: @@ -162,29 +172,38 @@ class DataMapping: "value": raw_data[datapoint], "investment_type": 33, "investment_id": "", - "investment_name": "" + "investment_name": "", } mapped_data_list.append(mapped_data) # Mapping raw data with database - iter_count = 30 + iter_count = 60 fund_match_result = {} if len(fund_raw_name_list) > 0: - fund_match_result = self.get_raw_name_db_match_result(fund_raw_name_list, "fund", iter_count) - logger.info(f"Fund match result: \n{fund_match_result}") + fund_match_result = self.get_raw_name_db_match_result( + fund_raw_name_list, "fund", iter_count + ) + # logger.info(f"Fund match result: \n{fund_match_result}") share_match_result = {} if len(share_raw_name_list) > 0: - share_match_result = self.get_raw_name_db_match_result(share_raw_name_list, "share", iter_count) - logger.info(f"Share match result: \n{share_match_result}") - + share_match_result = self.get_raw_name_db_match_result( + share_raw_name_list, "share", iter_count + ) + # logger.info(f"Share match result: \n{share_match_result}") + for mapped_data in mapped_data_list: investment_type = mapped_data["investment_type"] raw_name = mapped_data["raw_name"] if investment_type == 33: if fund_match_result.get(raw_name) is not None: matched_db_fund_name = fund_match_result[raw_name] - if matched_db_fund_name is not None and len(matched_db_fund_name) > 0: + if ( + matched_db_fund_name is not None + and len(matched_db_fund_name) > 0 + ): # get FundId from self.doc_fund_mapping - find_fund_df = self.doc_fund_mapping[self.doc_fund_mapping["FundName"] == matched_db_fund_name] + find_fund_df = self.doc_fund_mapping[ + self.doc_fund_mapping["FundName"] == matched_db_fund_name + ] if find_fund_df is not None and len(find_fund_df) > 0: fund_id = find_fund_df["FundId"].values[0] mapped_data["investment_id"] = fund_id @@ -193,38 +212,82 @@ class DataMapping: if investment_type == 1: if share_match_result.get(raw_name) is not None: matched_db_share_name = share_match_result[raw_name] - if matched_db_share_name is not None and len(matched_db_share_name) > 0: + if ( + matched_db_share_name is not None + and len(matched_db_share_name) > 0 + ): # get SecId from self.doc_fund_class_mapping - find_share_df = self.doc_fund_class_mapping[self.doc_fund_class_mapping["ShareClassName"] == matched_db_share_name] + find_share_df = self.doc_fund_class_mapping[ + self.doc_fund_class_mapping["ShareClassName"] + == matched_db_share_name + ] if find_share_df is not None and len(find_share_df) > 0: share_id = find_share_df["SecId"].values[0] mapped_data["investment_id"] = share_id mapped_data["investment_name"] = matched_db_share_name mapped_data["similarity"] = 1 - + self.output_mapping_file(mapped_data_list) return mapped_data_list - - def get_raw_name_db_match_result(self, raw_name_list, investment_type: str, iter_count: int = 30): + + def get_raw_name_db_match_result( + self, raw_name_list, investment_type: str, iter_count: int = 30 + ): # split raw_name_list into several parts which each part is with 30 elements # The reason to split is to avoid invoke token limitation issues from CahtGPT - raw_name_list_parts = [raw_name_list[i:i + iter_count] - for i in range(0, len(raw_name_list), iter_count)] + raw_name_list_parts = [ + raw_name_list[i : i + iter_count] + for i in range(0, len(raw_name_list), iter_count) + ] all_match_result = {} + doc_fund_name_list = deepcopy(self.doc_fund_name_list) + doc_share_name_list = deepcopy(self.doc_share_name_list) for raw_name_list in raw_name_list_parts: if investment_type == "fund": - match_result = final_function_to_match(doc_id=self.doc_id, - pred_list=raw_name_list, - db_list=self.doc_fund_name_list, - provider_name=self.provider_name) + match_result, doc_fund_name_list = self.get_final_function_to_match( + raw_name_list, doc_fund_name_list + ) else: - match_result = final_function_to_match(doc_id=self.doc_id, - pred_list=raw_name_list, - db_list=self.doc_share_name_list, - provider_name=self.provider_name) + match_result, doc_share_name_list = self.get_final_function_to_match( + raw_name_list, doc_share_name_list + ) all_match_result.update(match_result) return all_match_result + + def get_final_function_to_match(self, raw_name_list, db_name_list): + if len(db_name_list) == 0: + match_result = {} + for raw_name in raw_name_list: + match_result[raw_name] = "" + else: + match_result = final_function_to_match( + doc_id=self.doc_id, + pred_list=raw_name_list, + db_list=db_name_list, + provider_name=self.provider_name, + doc_source=self.doc_source + ) + matched_name_list = list(match_result.values()) + db_name_list = self.remove_matched_names(db_name_list, matched_name_list) + return match_result, db_name_list + + def remove_matched_names(self, target_name_list: list, matched_name_list: list): + if len(matched_name_list) == 0: + return target_name_list + matched_name_list = list(set(matched_name_list)) + matched_name_list = [ + value for value in matched_name_list if value is not None and len(value) > 0 + ] + for matched_name in matched_name_list: + if ( + matched_name is not None + and len(matched_name) > 0 + and matched_name in target_name_list + ): + target_name_list.remove(matched_name) + return target_name_list + def mapping_raw_data(self): """ doc_id, page_index, datapoint, value, @@ -245,9 +308,14 @@ class DataMapping: if raw_fund_name is None or len(raw_fund_name) == 0: continue raw_share_name = raw_data.get("share_name", "") - if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0: + if ( + len(self.doc_fund_name_list) == 0 + and len(self.provider_fund_name_list) == 0 + ): if len(raw_share_name) > 0: - integrated_share_name = self.integrate_share_name(raw_fund_name, raw_share_name) + integrated_share_name = self.integrate_share_name( + raw_fund_name, raw_share_name + ) raw_data_keys = list(raw_data.keys()) for datapoint in self.datapoints: if datapoint in raw_data_keys: @@ -262,7 +330,7 @@ class DataMapping: "investment_type": 1, "investment_id": "", "investment_name": "", - "similarity": 0 + "similarity": 0, } mapped_data_list.append(mapped_data) else: @@ -279,13 +347,15 @@ class DataMapping: "value": raw_data[datapoint], "investment_type": 33, "investment_id": "", - "investment_name": "" + "investment_name": "", } mapped_data_list.append(mapped_data) else: raw_name = "" if raw_share_name is not None and len(raw_share_name) > 0: - raw_name = self.integrate_share_name(raw_fund_name, raw_share_name) + raw_name = self.integrate_share_name( + raw_fund_name, raw_share_name + ) if mapped_share_cache.get(raw_name) is not None: investment_info = mapped_share_cache[raw_name] else: @@ -298,14 +368,20 @@ class DataMapping: ) fund_id = fund_info["id"] mapped_fund_cache[raw_fund_name] = fund_info - investment_info = self.matching_with_database( - raw_name=raw_name, - raw_share_name=raw_share_name, - raw_fund_name=raw_fund_name, - parent_id=fund_id, - matching_type="share", - process_cache=process_cache - ) + investment_info = {} + if len(fund_id) > 0: + investment_info = self.mapping_unique_raw_data(fund_id=fund_id, + raw_fund_name=raw_fund_name, + raw_data_list=raw_data_list) + if investment_info.get("id", None) is None or len(investment_info.get("id", "")) == 0: + investment_info = self.matching_with_database( + raw_name=raw_name, + raw_share_name=raw_share_name, + raw_fund_name=raw_fund_name, + parent_id=fund_id, + matching_type="share", + process_cache=process_cache, + ) mapped_share_cache[raw_name] = investment_info elif raw_fund_name is not None and len(raw_fund_name) > 0: raw_name = raw_fund_name @@ -322,7 +398,7 @@ class DataMapping: "id": "", "legal_name": "", "investment_type": -1, - "similarity": 0 + "similarity": 0, } raw_data_keys = list(raw_data.keys()) @@ -339,13 +415,35 @@ class DataMapping: "investment_type": investment_info["investment_type"], "investment_id": investment_info["id"], "investment_name": investment_info["legal_name"], - "similarity": investment_info["similarity"] + "similarity": investment_info["similarity"], } mapped_data_list.append(mapped_data) - + self.output_mapping_file(mapped_data_list) return mapped_data_list + def mapping_unique_raw_data(self, fund_id: str, raw_fund_name: str, raw_data_list: list): + share_count = 0 + for raw_data in raw_data_list: + fund_name = raw_data.get("fund_name", "") + share_name = raw_data.get("share_name", "") + if fund_name == raw_fund_name and share_name is not None and len(share_name) > 0: + share_count += 1 + if share_count > 1: + break + data_info = {} + if share_count == 1: + doc_compare_mapping = self.doc_fund_class_mapping[ + self.doc_fund_class_mapping["FundId"] == fund_id + ] + if len(doc_compare_mapping) == 1: + data_info["id"] = doc_compare_mapping["SecId"].values[0] + data_info["legal_name"] = doc_compare_mapping["ShareClassName"].values[0] + data_info["investment_type"] = 1 + data_info["similarity"] = 1 + return data_info + + def output_mapping_file(self, mapped_data_list: list): json_data_file = os.path.join( self.output_data_json_folder, f"{self.doc_id}.json" @@ -355,10 +453,10 @@ class DataMapping: extract_data_df = pd.DataFrame(self.raw_document_data_list) extract_data_df.reset_index(drop=True, inplace=True) - + mapping_data_df = pd.DataFrame(mapped_data_list) mapping_data_df.reset_index(drop=True, inplace=True) - + excel_data_file = os.path.join( self.output_data_excel_folder, f"{self.doc_id}.xlsx" ) @@ -373,7 +471,7 @@ class DataMapping: raw_name = "" if raw_share_name is not None and len(raw_share_name) > 0: raw_name = raw_share_name - # some share names are very short, + # some share names are very short, # so we need to combine with fund name raw_name_splits = raw_name.split() raw_fund_name_splits = raw_fund_name.split() @@ -384,13 +482,13 @@ class DataMapping: return raw_name def matching_with_database( - self, - raw_name: str, - raw_share_name: str = None, + self, + raw_name: str, + raw_share_name: str = None, raw_fund_name: str = None, - parent_id: str = None, + parent_id: str = None, matching_type: str = "fund", - process_cache: dict = {} + process_cache: dict = {}, ): if len(self.doc_fund_name_list) == 0 and len(self.provider_fund_name_list) == 0: data_info["id"] = "" @@ -402,7 +500,7 @@ class DataMapping: data_info["investment_type"] = investment_type data_info["similarity"] = 0 return data_info - + if matching_type == "fund": doc_compare_name_list = self.doc_fund_name_list doc_compare_mapping = self.doc_fund_mapping @@ -417,8 +515,9 @@ class DataMapping: doc_compare_mapping = self.doc_fund_class_mapping[ self.doc_fund_class_mapping["FundId"] == parent_id ] - provider_compare_mapping = self.provider_fund_class_mapping\ - [self.provider_fund_class_mapping["FundId"] == parent_id] + provider_compare_mapping = self.provider_fund_class_mapping[ + self.provider_fund_class_mapping["FundId"] == parent_id + ] if len(doc_compare_mapping) == 0: if len(provider_compare_mapping) == 0: doc_compare_name_list = self.doc_share_name_list @@ -435,9 +534,10 @@ class DataMapping: doc_compare_name_list = ( doc_compare_mapping["ShareClassName"].unique().tolist() ) - - if len(provider_compare_mapping) == 0 or \ - len(provider_compare_mapping) < len(doc_compare_mapping): + + if len(provider_compare_mapping) == 0 or len( + provider_compare_mapping + ) < len(doc_compare_mapping): provider_compare_name_list = doc_compare_name_list provider_compare_mapping = doc_compare_mapping else: @@ -459,58 +559,68 @@ class DataMapping: if doc_compare_name_list is not None and len(doc_compare_name_list) > 0: _, pre_common_word_list = remove_common_word(doc_compare_name_list) max_similarity_name, max_similarity = get_most_similar_name( - raw_name, - doc_compare_name_list, - share_name=raw_share_name, + raw_name, + doc_compare_name_list, + share_name=raw_share_name, fund_name=raw_fund_name, matching_type=matching_type, - process_cache=process_cache) + process_cache=process_cache, + ) if matching_type == "fund": threshold = 0.7 else: - threshold = 0.9 + if self.compare_with_provider: + threshold = 0.9 + else: + threshold = 0.6 if max_similarity is not None and max_similarity >= threshold: data_info["id"] = doc_compare_mapping[ doc_compare_mapping[compare_name_dp] == max_similarity_name ][compare_id_dp].values[0] data_info["legal_name"] = max_similarity_name data_info["similarity"] = max_similarity - + if data_info.get("id", None) is None or len(data_info.get("id", "")) == 0: # set pre_common_word_list, reason: the document mapping for same fund maybe different with provider mapping # the purpose is to get the most common word list, to improve the similarity. - max_similarity_name, max_similarity = get_most_similar_name( - raw_name, - provider_compare_name_list, - share_name=raw_share_name, - fund_name=raw_fund_name, - matching_type=matching_type, - pre_common_word_list=pre_common_word_list, - process_cache=process_cache - ) - threshold = 0.7 - if matching_type == "share": - threshold = 0.5 - round_similarity = 0 - if max_similarity is not None and isinstance(max_similarity, float): - round_similarity = round(max_similarity, 1) - if round_similarity is not None and round_similarity >= threshold: - data_info["id"] = provider_compare_mapping[ - provider_compare_mapping[compare_name_dp] == max_similarity_name - ][compare_id_dp].values[0] - data_info["legal_name"] = max_similarity_name - data_info["similarity"] = max_similarity - else: - if len(doc_compare_name_list) == 1: - data_info["id"] = doc_compare_mapping[ - doc_compare_mapping[compare_name_dp] == doc_compare_name_list[0] + if self.compare_with_provider: + max_similarity_name, max_similarity = get_most_similar_name( + raw_name, + provider_compare_name_list, + share_name=raw_share_name, + fund_name=raw_fund_name, + matching_type=matching_type, + pre_common_word_list=pre_common_word_list, + process_cache=process_cache, + ) + threshold = 0.7 + if matching_type == "share": + threshold = 0.5 + round_similarity = 0 + if max_similarity is not None and isinstance(max_similarity, float): + round_similarity = round(max_similarity, 1) + if round_similarity is not None and round_similarity >= threshold: + data_info["id"] = provider_compare_mapping[ + provider_compare_mapping[compare_name_dp] == max_similarity_name ][compare_id_dp].values[0] - data_info["legal_name"] = doc_compare_name_list[0] - data_info["similarity"] = 1 + data_info["legal_name"] = max_similarity_name + data_info["similarity"] = max_similarity else: - data_info["id"] = "" - data_info["legal_name"] = "" - data_info["similarity"] = 0 + if len(doc_compare_name_list) == 1: + data_info["id"] = doc_compare_mapping[ + doc_compare_mapping[compare_name_dp] + == doc_compare_name_list[0] + ][compare_id_dp].values[0] + data_info["legal_name"] = doc_compare_name_list[0] + data_info["similarity"] = 1 + else: + data_info["id"] = "" + data_info["legal_name"] = "" + data_info["similarity"] = 0 + else: + data_info["id"] = "" + data_info["legal_name"] = "" + data_info["similarity"] = 0 data_info["investment_type"] = investment_type else: data_info["id"] = "" diff --git a/main.py b/main.py index 2052e49..bf07834 100644 --- a/main.py +++ b/main.py @@ -31,11 +31,14 @@ class EMEA_AR_Parsing: output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", extract_way: str = "text", drilldown_folder: str = r"/data/emea_ar/output/drilldown/", + compare_with_provider: bool = True ) -> None: self.doc_id = doc_id self.doc_source = doc_source self.pdf_folder = pdf_folder os.makedirs(self.pdf_folder, exist_ok=True) + self.compare_with_provider = compare_with_provider + self.pdf_file = self.download_pdf() self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) @@ -72,11 +75,11 @@ class EMEA_AR_Parsing: os.makedirs(self.output_mapping_data_folder, exist_ok=True) self.filter_pages = FilterPages( - self.doc_id, - self.pdf_file, - self.document_mapping_info_df, + self.doc_id, + self.pdf_file, + self.document_mapping_info_df, self.doc_source, - output_pdf_text_folder + output_pdf_text_folder, ) self.page_text_dict = self.filter_pages.page_text_dict @@ -87,7 +90,9 @@ class EMEA_AR_Parsing: drilldown_folder = r"/data/emea_ar/output/drilldown/" os.makedirs(drilldown_folder, exist_ok=True) self.drilldown_folder = drilldown_folder - misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json") + misc_config_file = os.path.join( + f"./configuration/{doc_source}/", "misc_config.json" + ) if os.path.exists(misc_config_file): with open(misc_config_file, "r", encoding="utf-8") as f: misc_config = json.load(f) @@ -278,7 +283,8 @@ class EMEA_AR_Parsing: data_from_gpt, self.document_mapping_info_df, self.output_mapping_data_folder, - self.doc_source + self.doc_source, + compare_with_provider=self.compare_with_provider ) return data_mapping.mapping_raw_data_entrance() @@ -334,6 +340,7 @@ def mapping_data( output_mapping_data_folder=output_mapping_folder, extract_way=extract_way, drilldown_folder=drilldown_folder, + compare_with_provider=False ) doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data( re_run=re_run_extract_data @@ -501,19 +508,29 @@ def batch_start_job( result_extract_data_df.to_excel( writer, index=False, sheet_name="extract_data" ) - - if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file): + + if ( + document_mapping_file is not None + and len(document_mapping_file) > 0 + and os.path.exists(document_mapping_file) + ): try: - merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/") + merged_total_data_folder = os.path.join( + output_mapping_total_folder, "merged/" + ) os.makedirs(merged_total_data_folder, exist_ok=True) data_file_base_name = os.path.basename(output_file) - output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) - merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path) + output_merged_data_file_path = os.path.join( + merged_total_data_folder, "merged_" + data_file_base_name + ) + merge_output_data_aus_prospectus( + output_file, document_mapping_file, output_merged_data_file_path + ) except Exception as e: logger.error(f"Error: {e}") if calculate_metrics: - prediction_sheet_name = "total_mapping_data" + prediction_sheet_name = "data_in_doc_mapping" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" metrics_output_folder = r"/data/emea_ar/output/metrics/" @@ -770,11 +787,11 @@ def test_auto_generate_instructions(): def test_data_extraction_metrics(): - data_type = "data_extraction" + data_type = "document_mapping_in_db" # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx" - prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx" + prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx" # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx" - prediction_sheet_name = "mapping_data" + prediction_sheet_name = "data_in_doc_mapping" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" metrics_output_folder = r"/data/emea_ar/output/metrics/" @@ -1017,7 +1034,7 @@ def batch_run_documents( ) re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_way = "text" @@ -1194,13 +1211,17 @@ def merge_output_data_aus_prospectus( ): # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16 data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data") - document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping") + document_mapping_df = pd.read_excel( + document_mapping_file, sheet_name="document_mapping" + ) # set doc_id to be string type data_df["doc_id"] = data_df["doc_id"].astype(str) document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str) doc_id_list = data_df["doc_id"].unique().tolist() - datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json" + datapoint_keyword_config_file = ( + r"./configuration/aus_prospectus/datapoint_name.json" + ) with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f: datapoint_keyword_config = json.load(f) datapoint_name_list = list(datapoint_keyword_config.keys()) @@ -1212,7 +1233,9 @@ def merge_output_data_aus_prospectus( "EffectiveDate" ].values[0] )[0:10] - share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)] + share_doc_data_df = data_df[ + (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1) + ] exist_raw_name_list = [] for index, row in share_doc_data_df.iterrows(): doc_id = str(row["doc_id"]) @@ -1228,7 +1251,9 @@ def merge_output_data_aus_prospectus( fund_id = "" fund_legal_name = "" if share_class_id != "": - record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id] + record_row = document_mapping_df[ + document_mapping_df["FundClassId"] == share_class_id + ] if len(record_row) > 0: fund_id = record_row["FundId"].values[0] fund_legal_name = record_row["FundLegalName"].values[0] @@ -1265,16 +1290,16 @@ def merge_output_data_aus_prospectus( doc_data_list.append(data) # find data from total_data_list by raw_name for data in doc_data_list: - if ( - data["raw_name"] == raw_name - ): + if data["raw_name"] == raw_name: update_key = datapoint data[update_key] = value if page_index not in data["page_index"]: data["page_index"].append(page_index) break - - fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)] + + fund_doc_data_df = data_df[ + (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) + ] for index, row in fund_doc_data_df.iterrows(): doc_id = str(row["doc_id"]) page_index = int(row["page_index"]) @@ -1285,12 +1310,13 @@ def merge_output_data_aus_prospectus( value = row["value"] fund_id = row["investment_id"] fund_legal_name = row["investment_name"] - + exist = False if fund_id != "": for data in doc_data_list: - if (fund_id != "" and data["fund_id"] == fund_id) or \ - (data["raw_fund_name"] == raw_fund_name): + if (fund_id != "" and data["fund_id"] == fund_id) or ( + data["raw_fund_name"] == raw_fund_name + ): update_key = datapoint data[update_key] = value if page_index not in data["page_index"]: @@ -1323,6 +1349,7 @@ def merge_output_data_aus_prospectus( if __name__ == "__main__": + # test_data_extraction_metrics() # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' @@ -1347,10 +1374,12 @@ if __name__ == "__main__": # output_mapping_child_folder=output_mapping_child_folder) # special_doc_id_list = ["553242411"] - - doc_source = "aus_prospectus" + + doc_source = "emea_ar" if doc_source == "aus_prospectus": - document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + document_sample_file = ( + r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" @@ -1383,7 +1412,7 @@ if __name__ == "__main__": r"/data/aus_prospectus/output/mapping_data/total/" ) drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - + batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list, @@ -1397,7 +1426,61 @@ if __name__ == "__main__": drilldown_folder=drilldown_folder, ) elif doc_source == "emea_ar": - special_doc_id_list = ["553242408"] + special_doc_id_list = [ + "292989214", + "316237292", + "321733631", + "323390570", + "327956364", + "333207452", + "334718372", + "344636875", + "362246081", + "366179419", + "380945052", + "382366116", + "387202452", + "389171486", + "391456740", + "391736837", + "394778487", + "401684600", + "402113224", + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376", + "449555622", + "449623976", + "458291624", + "458359181", + "463081566", + "469138353", + "471641628", + "476492237", + "478585901", + "478586066", + "479042264", + "479793787", + "481475385", + "483617247", + "486378555", + "486383912", + "492121213", + "497497599", + "502693599", + "502821436", + "503194284", + "506559375", + "507967525", + "508854243", + "509845549", + "520879048", + "529925114", + ] + special_doc_id_list = ["471641628"] batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list )