initial
This commit is contained in:
commit
424c30853c
|
|
@ -0,0 +1,4 @@
|
||||||
|
/.env
|
||||||
|
/log
|
||||||
|
/utils/__pycache__
|
||||||
|
/__pycache__/*.pyc
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
Flask==3.0.3
|
||||||
|
flasgger==0.9.7.1
|
||||||
|
PyMuPDF==1.24.4
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
boto3==1.34.106
|
||||||
|
tqdm==4.66.4
|
||||||
|
openai==1.35.10
|
||||||
|
|
@ -0,0 +1,121 @@
|
||||||
|
# from transformers import GPT2TokenizerFast
|
||||||
|
import tiktoken
|
||||||
|
from openai import AzureOpenAI
|
||||||
|
import openai
|
||||||
|
import os
|
||||||
|
from time import sleep
|
||||||
|
import dotenv
|
||||||
|
# loads .env file with your OPENAI_API_KEY
|
||||||
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
def set_environment_variables(engine=os.getenv("Engine_0613_16k")):
    """Point the global ``openai`` module at the Azure deployment for *engine*.

    Selects the API base URL and key from environment variables according to
    the engine/deployment name prefix, then sets the shared engine, API type
    and API version on the ``openai`` module.

    Parameters
    ----------
    engine : str or None
        Deployment name. Defaults to the ``Engine_0613_16k`` environment
        variable (note: evaluated once at import time).
    """
    # Robustness fix: os.getenv() returns None when the variable is unset,
    # and None.startswith(...) would raise AttributeError. Treat a missing
    # engine like an unrecognized one and fall through to the defaults.
    if engine is None:
        engine = ""
    if engine.startswith('gpt4') or engine.startswith('gpt-4'):
        openai.api_base = os.getenv("OPENAI_API_BASE_DC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4")
    elif engine.startswith('modc-stg-gpt4'):
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_MODC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_MODC")
    elif engine.upper() == 'ENGINE_GPT4_TURBO':
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT4_TURBO")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT4_TURBO")
    elif engine.startswith('modc-stg-gpt35turbo16k'):
        openai.api_base = os.getenv("OPENAI_API_BASE_GPT3_MODC")
        openai.api_key = os.getenv("OPENAI_API_KEY_GPT3_MODC")
    else:
        openai.api_base = os.getenv("OPENAI_API_BASE")
        openai.api_key = os.getenv("OPENAI_API_KEY")
    openai.Engine = engine
    openai.api_type = os.getenv("OPENAI_API_TYPE")
    openai.api_version = os.getenv("OPENAI_API_VERSION")
|
||||||
|
|
||||||
|
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||||
|
# Shared tiktoken encoder used by the token-counting helpers below;
# "cl100k_base" is the encoding used by the GPT-3.5/GPT-4 model family.
tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding(text, engine=os.getenv("EMBEDDING_ENGINE")):
    """Return the embedding vector for *text*, retrying on transient errors.

    Parameters
    ----------
    text : str
        Text to embed.
    engine : str or None
        Embedding deployment name; defaults to the ``EMBEDDING_ENGINE``
        environment variable (evaluated once at import time).

    Returns
    -------
    list[float] on success, or None after five consecutive failures.
    """
    # NOTE(review): openai.Embedding.create is the legacy (<1.0) API surface;
    # requirements pin openai==1.35.10, where this call raises
    # APIRemovedInV1 at runtime — confirm and migrate to
    # client.embeddings.create (as `chat` below already uses AzureOpenAI).
    count = 0
    error = ''
    while count < 5:
        try:
            if count > 0:
                print(f'retrying the {count} time for getting text embedding...')
            return openai.Embedding.create(input=text, engine=engine)['data'][0]['embedding']
        except Exception as e:
            error = str(e)
            print(error)
            count += 1
            sleep(1)
    # Bug fix (clarity): the original fell off the loop and returned None
    # implicitly; make the failure value explicit for callers.
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def num_tokens_from_string(string: str) -> int:
    """Count the tokens the module-level tokenizer produces for *string*."""
    return len(tokenizer.encode(string))
|
||||||
|
|
||||||
|
|
||||||
|
def num_tokens_from_messages(messages, model="gpt-35-turbo-16k"):
    """Estimate the prompt tokens consumed by a list of chat messages.

    Each message is a dict (e.g. role/content/name); per-message overhead
    depends on the model family.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-35-turbo-16k":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # when a name is present the role is omitted, hence the -1.
        tokens_per_message, tokens_per_name = 4, -1
    else:
        # gpt-4-32k and all other models use the same accounting.
        tokens_per_message, tokens_per_name = 3, 1

    total = 0
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3
|
||||||
|
|
||||||
|
|
||||||
|
def chat(prompt: str,
         engine = os.getenv("Engine_GPT4o"),
         azure_endpoint=os.getenv("OPENAI_API_BASE_GPT4o"),
         api_key=os.getenv("OPENAI_API_KEY_GPT4o"),
         api_version=os.getenv("OPENAI_API_VERSION_GPT4o"),
         temperature: float = 0.0):
    """Send a single-turn user prompt to an Azure OpenAI chat deployment.

    Retries up to 8 times on errors, giving up immediately on context-length
    errors.

    Returns
    -------
    (content, False) on success; (error_message, True) on failure.
    """
    client = AzureOpenAI(
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version
    )

    max_tokens = 4000
    request_timeout = 120
    error = ''
    for attempt in range(8):
        try:
            if attempt > 0:
                print(f'retrying the {attempt} time...')
            response = client.chat.completions.create(
                model=engine,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=0.95,
                frequency_penalty=0,
                presence_penalty=0,
                timeout=request_timeout,
                stop=None,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            return response.choices[0].message.content, False
        except Exception as e:
            error = str(e)
            print(f"error message: {error}")
            # A too-long prompt will never succeed on retry — fail fast.
            if 'maximum context length' in error:
                return error, True
            sleep(3)
    return error, True
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from logging.handlers import TimedRotatingFileHandler
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class Logger:
    """Configure the root logger to write timestamped files under ./log/.

    Instantiating the class attaches a daily-rotating file handler to the
    root logger and sets its level to INFO; get_logger() returns that logger.
    """

    def __init__(self):
        # Ensure the log directory exists before attaching the file handler.
        log_dir = r'./log/'
        os.makedirs(log_dir, exist_ok=True)
        # Log file name pattern: ar_<yyyyMMddHHmmss>.log
        timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
        self._log_filename = os.path.join(log_dir, 'ar_{}.log'.format(timestamp))
        # NOTE(review): basicConfig() also installs a root StreamHandler, so
        # records go to stderr as well as the file — confirm this is intended.
        logging.basicConfig()
        # Shared record format for all handlers.
        self._formatter = logging.Formatter('%(asctime)s - %(process)d - %(levelname)s: %(message)s',
                                            datefmt='%Y-%m-%d %H:%M:%S')
        self._logger = logging.getLogger()  # the root logger
        # self.set_console_logger()
        self.set_file_logger()
        self._logger.setLevel(logging.INFO)

    def set_console_logger(self):
        """Attach an INFO-level console handler (currently unused)."""
        handler = logging.StreamHandler()
        handler.setFormatter(self._formatter)
        handler.setLevel(logging.INFO)
        self._logger.addHandler(handler)

    def set_file_logger(self):
        """Attach a daily-rotating file handler that keeps 3 backups."""
        handler = TimedRotatingFileHandler(filename=self._log_filename,
                                           when="D",
                                           interval=1,
                                           backupCount=3,
                                           encoding='utf-8')
        handler.setFormatter(self._formatter)
        handler.setLevel(logging.INFO)
        # handler.suffix = "%Y%m%d_%H%M%S.log"
        self._logger.addHandler(handler)

    def get_logger(self):
        """Return the configured root logger."""
        return self._logger
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton: importing this module configures the root logger
# (file handler under ./log/) and exposes it as `logger`.
logger = Logger().get_logger()
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
import boto3
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
from utils.logger import logger
|
||||||
|
import dotenv
|
||||||
|
# loads .env file with your OPENAI_API_KEY
|
||||||
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
|
def try_ntimes(func, params, success_msg='', error_msg='', error_res=False, ntimes=1, interval=1):
    """Call ``func(**params)``, retrying up to *ntimes* attempts.

    Sleeps *interval* seconds between attempts. Prints *success_msg* and
    returns the call's result on success; prints *error_msg* and returns
    *error_res* once every attempt has failed.
    """
    attempt = 1
    while True:
        try:
            result = func(**params)
        except Exception as e:
            # Last attempt: give up and hand back the caller's sentinel.
            if attempt == ntimes:
                print(error_msg)
                return error_res
            print(f'Please Set AWS environment variables at first, error: {e}')
            print("Having tried {} times and trying one more time...".format(attempt))
            time.sleep(interval)
            attempt += 1
        else:
            print(success_msg)
            return result
|
||||||
|
|
||||||
|
def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
    """Download the PDF for *doc_id* from the S3 documents warehouse.

    The file is saved as ``<pdf_directory>/<doc_id>.pdf``; an already-present
    file is reused without contacting S3.

    Parameters
    ----------
    pdf_directory : str
        Local destination directory (created if missing).
    doc_id : str
        S3 object key of the document.

    Returns
    -------
    str path to the local PDF, or None when pdf_directory is missing or the
    download ultimately fails.
    """
    if pdf_directory is None or pdf_directory == "":
        logger.error("pdf_directory is not provided")
        return None
    os.makedirs(pdf_directory, exist_ok=True)

    pdf_file_path = os.path.join(pdf_directory, f"{doc_id}.pdf")

    if os.path.exists(pdf_file_path):
        logger.info(f"PDF file for {os.path.basename(pdf_file_path)} already exists. Skipping...")
        return pdf_file_path

    ACCESS_KEY = os.getenv('ACCESS_KEY')
    SECRET_KEY = os.getenv('SECRET_KEY')

    session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
    s3 = session.client('s3')

    bucket_name = os.getenv('BUCKET_NAME')

    params = {'Bucket': bucket_name, 'Key': doc_id, 'Filename': pdf_file_path}
    success_msg = f'file downloaded from S3 successfully: {doc_id}'
    error_msg = f'failed to download file {doc_id} from S3'
    error_res = '__process_failed__'

    res = try_ntimes(func=s3.download_file, params=params,
                     success_msg=success_msg,
                     error_msg=error_msg, error_res=error_res,
                     ntimes=3, interval=5)

    # Bug fix: the original ignored the retry result and returned the path
    # even when every download attempt failed (pointing callers at a file
    # that does not exist). Treat failure like the other error paths.
    if res == error_res:
        logger.error(error_msg)
        return None
    return pdf_file_path
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,40 @@
|
||||||
|
""" upload one directory from the current working directory to aws """
|
||||||
|
from pathlib import Path
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import boto3
|
||||||
|
|
||||||
|
def upload_dir(local_dir, aws_init_dir, bucket_name, tag, prefix='/'):
    """
    From the current working directory, upload `local_dir` with all its
    subcontents (files and subdirectories...) to an AWS S3 bucket.

    Parameters
    ----------
    local_dir : local directory to be uploaded, with respect to current working directory
    aws_init_dir : prefix 'directory' in aws
    bucket_name : bucket in aws
    tag : glob pattern to select files, like *png
        NOTE: if you use tag it must be given like --tag '*txt', in some quotation marks... for argparse
    prefix : leading text removed once from each local file name to build the S3 key

    Returns
    -------
    None
    """
    s3 = boto3.resource('s3')
    p = Path(os.path.join(Path.cwd(), local_dir))
    # Walk every directory under local_dir (including local_dir itself).
    for mydir in p.glob('**'):
        # Files in this directory matching the tag pattern; skip directories
        # that happen to match.
        file_names = [f for f in glob.glob(os.path.join(mydir, tag))
                      if not Path(f).is_dir()]
        for file_name in file_names:
            s3_file_name = ""
            if file_name.startswith(prefix):  # only modify the text if it starts with the prefix
                s3_file_name = file_name.replace(prefix, "", 1)  # remove one instance of prefix
            print(f"fileName {file_name}")
            if len(s3_file_name) > 0:
                s3_path = os.path.join(aws_init_dir, str(s3_file_name))
                s3.meta.client.upload_file(file_name, bucket_name, s3_path)
|
||||||
|
|
@ -0,0 +1,165 @@
|
||||||
|
"""
|
||||||
|
@version: 0.1
|
||||||
|
@author: Blade He
|
||||||
|
@license: Morningstar
|
||||||
|
@contact: blade.he@morningstar.com
|
||||||
|
@site:
|
||||||
|
@software: PyCharm
|
||||||
|
@file: Similarity.py
|
||||||
|
@time: 2019/03/20
|
||||||
|
"""
|
||||||
|
from math import *
|
||||||
|
from decimal import Decimal
|
||||||
|
import math
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Similarity:
    """Similarity and distance measures over vectors, token lists, and text."""

    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two numeric sequences."""
        return sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))

    def manhattan_distance(self, x, y):
        """Return the Manhattan distance between two numeric sequences."""
        return sum(abs(a - b) for a, b in zip(x, y))

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order *p_value* between two numeric sequences."""
        return self.nth_root(sum(pow(abs(a - b), p_value) for a, b in zip(x, y)), p_value)

    def nth_root(self, value, n_root):
        """Return the n_root-th root of *value* as a Decimal rounded to 3 places."""
        root_value = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(root_value), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity between two numeric sequences, rounded to 3 places."""
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        return round(numerator / float(denominator), 3)

    def square_rooted(self, x):
        """Return the Euclidean norm of *x* rounded to 3 places."""
        return round(sqrt(sum([a * a for a in x])), 3)

    def jaccard_similarity(self, x: list, y: list):
        """Return the Jaccard similarity |x∩y| / |x∪y| (0 when the union is empty)."""
        intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
        union_cardinality = len(set.union(*[set(x), set(y)]))
        if union_cardinality == 0:
            return 0
        return intersection_cardinality / float(union_cardinality)

    def y_in_x_similarity(self, x: list, y: list):
        """Return the fraction of the distinct elements of *y* that also appear in *x*.

        Returns 0 when *y* is empty. (Asymmetric containment measure, not Jaccard.)
        """
        intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
        len_y = len(set(y))
        if len_y == 0:
            return 0
        return intersection_cardinality / float(len_y)

    def compare_text_in_text_list_similarity(self, text: str, compare_text_list: list):
        """Find the entry of *compare_text_list* most similar to *text*.

        Similarity is y_in_x_similarity over the de-duplicated word sets with
        the stop words "name"/"fund"/"funds" removed. A special pre-pass
        handles fund "feeder" names: if both *text* and some candidate
        contain the word "feeder", *text* is first replaced by the closest
        feeder candidate so feeder funds match feeder funds.

        Returns
        -------
        (max_similarity, max_similarity_text). Bug fix: the empty-input and
        no-usable-words paths previously returned a bare ``0`` while the
        normal path returned a tuple, so callers unpacking the result would
        crash; all paths now return ``(0, "")``.
        """
        if text is None or len(text) == 0:
            return 0, ""
        if compare_text_list is None or len(compare_text_list) == 0:
            return 0, ""
        # Normalize case before the feeder special case.
        text = text.lower()

        # Fix issue for matching fund feeder. Example:
        #   Raw fund name: Schroders Capital UK Real Estate Fund Feeder Trust
        #   Candidates:    Schroder UK Real Estate Fund Feeder Trust
        #                  Schroders Capital UK Real Estate Fund
        # The match should be the Feeder Trust entry, but the plain word-set
        # similarity preferred the non-feeder name; so when both sides use
        # "feeder", snap `text` to the closest feeder candidate first.
        if "feeder" in text.split():
            need_transform = any("feeder" in compare.lower().split()
                                 for compare in compare_text_list)
            if need_transform:
                temp_max_similarity = 0
                temp_max_similarity_text = ""
                for compare in compare_text_list:
                    compare = compare.lower()
                    if "feeder" in compare.split():
                        similarity = self.y_in_x_similarity(text.split(), compare.split())
                        if similarity > temp_max_similarity:
                            temp_max_similarity = similarity
                            temp_max_similarity_text = compare
                if temp_max_similarity > 0:
                    text = temp_max_similarity_text

        # Strip punctuation, collapse whitespace, drop generic fund words.
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text_split = list(set([word for word in text.split()
                               if word.lower() not in ["name", "fund", "funds"]]))
        if len(text_split) == 0:
            return 0, ""
        max_similarity = 0
        max_similarity_text = ""
        max_similarity_text_split = []
        for comapare_text in compare_text_list:
            updated_comapare_text = comapare_text.lower()
            updated_comapare_text = re.sub(r'\W', ' ', updated_comapare_text)
            updated_comapare_text = re.sub(r'\s+', ' ', updated_comapare_text)
            comapare_text_split = list(set([word for word in updated_comapare_text.split()
                                            if word.lower() not in ["name", "fund", "funds"]]))
            if len(comapare_text_split) == 0:
                continue
            similarity = self.y_in_x_similarity(text_split, comapare_text_split)
            # Tie-break equal scores in favor of the candidate with more
            # distinct words (the more specific name).
            if similarity > 0 and similarity == max_similarity:
                if len(comapare_text_split) > len(max_similarity_text_split):
                    max_similarity_text = comapare_text
                    max_similarity_text_split = comapare_text_split
            if similarity > max_similarity:
                max_similarity = similarity
                max_similarity_text = comapare_text
                max_similarity_text_split = comapare_text_split

        return max_similarity, max_similarity_text

    def edit_distance_similarity(self, left: str, right: str):
        """Return 1 - levenshtein(left, right) / max(len(left), len(right)).

        1.0 means identical strings; 0.0 means completely different.
        """
        # Robustness fix: two empty strings previously divided by zero;
        # identical (empty) strings are perfectly similar.
        longest = max(len(left), len(right))
        if longest == 0:
            return 1
        m, n = len(left) + 1, len(right) + 1
        # Full (m*n) dynamic-programming matrix of edit distances.
        matrix = [[0] * n for _ in range(m)]
        for i in range(1, m):
            matrix[i][0] = matrix[i - 1][0] + 1
        for j in range(1, n):
            matrix[0][j] = matrix[0][j - 1] + 1
        for i in range(1, m):
            for j in range(1, n):
                cost = 0 if left[i - 1] == right[j - 1] else 1
                matrix[i][j] = min(matrix[i - 1][j] + 1,
                                   matrix[i][j - 1] + 1,
                                   matrix[i - 1][j - 1] + cost)
        distance = matrix[m - 1][n - 1]
        return 1 - distance / longest
|
||||||
|
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
import os
|
||||||
|
import boto3
|
||||||
|
from ec2_metadata import ec2_metadata
|
||||||
|
|
||||||
|
|
||||||
|
def stop_instance():
    """Stop the current EC2 instance via the EC2 API, falling back to a
    forced local shutdown if the API call fails."""
    try:
        ec2_path = r"/home/ec2-user"
        # Heuristic: this home directory only exists when running on EC2.
        if os.path.exists(ec2_path):
            current_ec2_id = ec2_metadata.instance_id
            region = ec2_metadata.region
            ec2 = boto3.client('ec2', region_name=region)
            # Bug fix: the boto3 parameter is CamelCase `InstanceIds`;
            # `instance_ids` raised ParamValidationError, which the broad
            # except silently converted into the shutdown fallback.
            ec2.stop_instances(InstanceIds=[current_ec2_id])
    except Exception as e:
        print(e)
        # NOTE(review): indentation reconstructed from a mangled dump — the
        # forced shutdown appears to be the failure fallback; confirm.
        os.system("sudo shutdown now -h")
|
||||||
Loading…
Reference in New Issue