File size: 9,605 Bytes
a2ee974
 
 
 
 
5dd704c
51e1c58
a2ee974
 
 
 
 
5dd704c
a2ee974
 
 
 
51e1c58
a2ee974
 
 
 
 
 
 
 
 
 
5dd704c
a2ee974
 
 
 
 
f966467
a2ee974
 
 
 
 
 
 
 
 
 
 
 
5dd704c
 
51e1c58
5dd704c
51e1c58
 
 
 
 
 
566bba1
a2ee974
 
 
 
 
 
 
5dd704c
a2ee974
 
 
 
5dd704c
9db2841
 
 
 
 
 
 
 
 
 
 
 
 
5dd704c
 
 
 
 
 
 
 
 
 
 
 
 
 
9db2841
41f1164
9db2841
 
 
 
5e4d2de
bd5a0eb
7e729ae
bd5a0eb
 
9db2841
 
bd5a0eb
9db2841
 
 
 
5dd704c
 
 
 
 
 
 
 
 
 
 
 
 
 
24add6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2ee974
566bba1
 
 
6f82717
566bba1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2ee974
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Created by Leandro Carneiro at 19/01/2024
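# Description: RAG helpers that index local .txt files in Chroma and generate Portuguese news articles with OpenAI or Together models.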
# ------------------------------------------------
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_together.embeddings import TogetherEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_together import Together
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import os
import csv
import time

def read_csv_to_dict(filename: str) -> dict:
    """Load a two-column, semicolon-separated file into a dictionary.

    Each non-blank line must look like ``key;value``. The first field is the
    key; everything after the first separator is the value, so a value that
    itself contains ``;`` or ``,`` is kept intact. When a key appears more
    than once, the last occurrence wins.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A dict mapping each key to its value (both strings, not stripped).

    Raises:
        ValueError: If a non-blank line has no ``;`` separator.
        OSError: If the file cannot be opened.
    """
    data_dict = {}
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        # Parse with the real separator: the default comma dialect would cut
        # any value containing a comma before we ever see it.
        for row in csv.reader(file, delimiter=';'):
            if not row:
                # Blank line: nothing to record.
                continue
            if len(row) < 2:
                raise ValueError(
                    f"expected 'key;value' but got {row!r} in {filename}"
                )
            # Re-join extra fields so a ';' inside the value is preserved.
            data_dict[row[0]] = ';'.join(row[1:])
    return data_dict

def generate_embeddings_and_vectorstore(path: str, model: str):
    """Embed the ``.txt`` files under *path* and index them in a Chroma store.

    The documents are loaded recursively, split into chunks of up to 4000
    characters with a 400-character overlap, and each chunk's metadata gets a
    ``"link"`` entry looked up by source file name in
    ``./local_base/filename_url.csv`` (``None`` when the file is not listed).

    Args:
        path: Directory searched recursively for ``*.txt`` files.
        model: ``'openai'`` embeds with ``OpenAIEmbeddings`` (reads the
            ``OPENAI_KEY`` environment variable); any other value embeds with
            ``TogetherEmbeddings`` (reads ``TOGETHER_KEY``).

    Returns:
        The populated Chroma vector store on success. On any failure the
        error message is printed and returned as a ``str`` instead of being
        raised, so callers must check the type of the result.
    """
    try:
        loader = DirectoryLoader(path=path, glob="**/*.txt")
        corpus = loader.load()
        print(f'    Total de documentos antes do text_split = {len(corpus)}')

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=400)
        docs = text_splitter.split_documents(corpus)
        print(f"    Total de chunks depois do text_split = {len(docs)}")
        if not docs:
            # Without this guard the average below fails with an opaque
            # "division by zero" and no store could be built anyway.
            message = f'Nenhum documento .txt encontrado em {path}'
            print(message)
            return message

        num_total_characters = sum(len(x.page_content) for x in docs)
        print(f"    Média de caracteres por chunk = {num_total_characters / len(docs):,.0f}")

        # Attach the public URL of each chunk's source file so it can be
        # cited later; unmapped files get None.
        dict_filename_url = read_csv_to_dict('./local_base/filename_url.csv')
        for doc in docs:
            filename = os.path.basename(doc.metadata["source"])
            doc.metadata["link"] = dict_filename_url.get(filename)

        if model == 'openai':
            fc_embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_KEY'])
            vectorstore = Chroma.from_documents(docs, fc_embeddings)
        else:
            fc_embeddings = TogetherEmbeddings(model = 'togethercomputer/m2-bert-80M-8k-retrieval', together_api_key = os.environ['TOGETHER_KEY'])
            # Embed one chunk per request, pausing between requests
            # (presumably to stay under the Together API rate limit).
            # The store is created once and then extended, instead of being
            # re-created for every chunk.
            first_doc, *remaining_docs = docs
            vectorstore = Chroma.from_documents(documents=[first_doc], embedding=fc_embeddings)
            for doc in remaining_docs:
                time.sleep(0.5)
                vectorstore.add_documents([doc])
        print('total de docs no vectorstore=',len(vectorstore.get()['documents']))

        return vectorstore
    except Exception as e:
        # Errors are reported through the return value rather than raised.
        print(str(e))
        return str(e)

class Rag:
    """Write a Portuguese news article from retrieved text chunks.

    Wraps a ``ConversationalRetrievalChain`` (chain type ``"stuff"``) over the
    given vector store, backed by either an OpenAI chat model or a Together
    completion model, with a prompt asking for a titled, formal news article
    within a word range.
    """

    # Prompt sent to the OpenAI chat model. The text is part of the program's
    # behavior; do not reformat the string.
    _OPENAI_PROMPT_TEMPLATE = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
                        Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
                        If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
                        The news should have a title.
                        The news should be written in a formal language.
                        The news should have between {min_words} and {max_words} words and it should be in Portuguese language.
                        The news should be about the following context: <{context}>
                        Question: <{question}>
                        Answer here:"""

    # Prompt sent to the Together model; it carries extra instructions
    # (hide sources and word count, answer only once).
    _TOGETHER_PROMPT_TEMPLATE = """Your task is to create news for a newspaper based on pieces of text delimited by <> and a question delimited by <>.
                        The news should be written in Portuguese language.
                        Do not use only your knowledge to make the news. Make the news based on the question, but using the pieces of text.
                        If the pieces of text don't enough information about the question to create the news, just say that you need more sources of information, nothing more.
                        The news should have a title.
                        The news should be written in a formal language.
                        The news should have between {min_words} and {max_words} words.
                        The source written in the pieces of text should not be shown in the news.
                        The total of words should no be shown in the news.
                        The news should be written in Portuguese language.
                        Answer the title and the news, once, nothing more.
                        The news should be about the following context: <{context}>
                        Question: <{question}>
                        Answer here: """

    def __init__(self, vectorstore, min_words, max_words, model):
        """Build the prompt, the LLM and the retrieval chain.

        Args:
            vectorstore: Store queried for context; it must provide
                ``as_retriever()`` and ``delete_collection()``.
            min_words: Lower bound of the article length, injected in the prompt.
            max_words: Upper bound of the article length, injected in the
                prompt and used to size the response token budget. Must be
                convertible with ``int()``.
            model: ``'openai'`` selects ``gpt-3.5-turbo-0125`` (reads the
                ``OPENAI_KEY`` environment variable); any other value selects
                ``mistralai/Mixtral-8x7B-Instruct-v0.1`` on Together (reads
                ``TOGETHER_KEY``).
        """
        self.text = None
        self.vectorstore = vectorstore
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

        use_openai = model == 'openai'
        if use_openai:
            prompt_template = self._OPENAI_PROMPT_TEMPLATE
        else:
            prompt_template = self._TOGETHER_PROMPT_TEMPLATE
        self.prompt = PromptTemplate(template=prompt_template,
                                     input_variables=["context", "question"],
                                     partial_variables={"min_words": min_words, "max_words": max_words})

        # Maximum number of tokens for the answer: 1.5 per requested word.
        word_limit = int(max_words)
        max_tokens = int(word_limit + (word_limit / 2))

        if use_openai:
            llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125",
                             temperature=0,
                             openai_api_key=os.environ['OPENAI_KEY'],
                             max_tokens=max_tokens)
        else:
            llm = Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1",
                           temperature=0,
                           together_api_key=os.environ['TOGETHER_KEY'],
                           max_tokens=max_tokens)

        # The default retriever is used. NOTE(review): a
        # 'similarity_score_threshold' retriever (k=4, score_threshold=0.8)
        # is a possible alternative if irrelevant chunks become a problem.
        self.qa = ConversationalRetrievalChain.from_llm(
            llm=llm,
            memory=self.memory,
            retriever=vectorstore.as_retriever(),
            combine_docs_chain_kwargs={"prompt": self.prompt},
            chain_type="stuff",  # other chain types: map_reduce, refine, map_rerank
            return_source_documents=True,
        )

    def generate_text(self, subject):
        """Generate one article about *subject* and list the sources used.

        The vector store collection is deleted afterwards whether or not the
        generation succeeded, so the instance serves a single call.

        Args:
            subject: Topic inserted into the Portuguese query sent to the chain.

        Returns:
            On success, a tuple ``(answer, sources)`` where ``sources`` is a
            numbered, newline-terminated list of the distinct ``"link"``
            metadata values of the retrieved chunks, in retrieval order
            (chunks without a link are skipped). On failure, the error
            message as a plain ``str``.
        """
        try:
            query = f"Elabore uma nova notícia sobre {subject}."
            result_text = self.qa.invoke({"question": query})
            print('##### result', result_text)

            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the numbering is stable between runs (a set is not ordered).
            links = (doc.metadata.get('link') for doc in result_text["source_documents"])
            result_sources = list(dict.fromkeys(link for link in links if link))
            str_result_sources = ''.join(
                f'{position}) {link}\n'
                for position, link in enumerate(result_sources, start=1)
            )

            return (result_text["answer"], str_result_sources)
        except Exception as e:
            # Errors are reported through the return value rather than raised.
            return str(e)
        finally:
            # Single cleanup point for both the success and the failure path.
            self.vectorstore.delete_collection()