Upload 3 files
Browse files- chroma_modelY_save.py +24 -0
- chroma_modely.py +163 -0
- requirements.txt +14 -0
chroma_modelY_save.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Build and persist a Chroma vector store from the Model Y review CSV.

Reads `modelY_transformed_corrected.csv` (columns: `page_content`, `metadata`),
wraps each row in a langchain `Document`, and writes an OpenAI-embedded Chroma
index to ./chroma_db_modelY for chroma_modely.py to reload.
"""
import ast
import os

import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma

# SECURITY: an OpenAI API key was previously hard-coded here. Never commit
# secrets to source control — supply the key via the environment instead
# (`export OPENAI_API_KEY=...`) and revoke the leaked key.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")

embeddings = OpenAIEmbeddings()
# embeddings = ModelScopeEmbeddings(model_id="xrunda/m3e-base",model_revision="v1.0.4")

file_path = 'modelY_transformed_corrected.csv'
data = pd.read_csv(file_path)

docs = []
for _, row in data.iterrows():
    # The `metadata` column holds the repr of a Python dict (single-quoted).
    # The previous replace("'", '"') + json.loads round-trip broke on any
    # value that itself contained a quote/apostrophe; ast.literal_eval parses
    # the literal safely and correctly.
    metadata = ast.literal_eval(row['metadata'])
    docs.append(Document(page_content=row['page_content'], metadata=metadata))

# from_documents embeds all docs and persists the index when
# persist_directory is given; the return value is not needed here.
Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db_modelY")
|
chroma_modely.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""RAG question-answering over real Tesla Model Y owner reviews.

Loads the Chroma index persisted by chroma_modelY_save.py, builds a
self-querying retriever that can translate natural-language questions into
structured metadata filters (brand / model / name / year / price), and
answers a user question with a ChatOpenAI summarization chain.
"""
import os

from langchain.chains import LLMChain
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.vectorstores import Chroma

# SECURITY: OpenAI and DashScope API keys were previously hard-coded here.
# Never commit secrets to source control — read them from the environment
# and revoke the leaked keys.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")
# Only needed if the (currently disabled) Tongyi/Qwen model is re-enabled.
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")

embeddings = OpenAIEmbeddings()
# Reload the index built by chroma_modelY_save.py.
vectorstore = Chroma(persist_directory="./chroma_db_modelY", embedding_function=embeddings)
# NOTE: the original code called vectorstore.as_retriever(search_type=
# "similarity_score_threshold", ...) here and discarded the result — a no-op,
# because as_retriever() returns a new retriever object and does not mutate
# the store. The retriever actually used is the SelfQueryRetriever below.

# Metadata attributes the self-query retriever may turn into filters.
# (Descriptions are consumed by the LLM query constructor; kept in Chinese
# to match the indexed data.)
metadata_field_info = [
    AttributeInfo(name="brand", description="汽车品牌", type="string"),
    AttributeInfo(name="model", description="车型", type="string"),
    AttributeInfo(name="name", description="具体车型名称", type="string"),
    AttributeInfo(name="year", description="上市年份", type="integer"),
    AttributeInfo(name="price", description="售价", type="string"),
]
document_content_description = "汽车车型的用户评价"

# Completion model used only for query construction (question -> filter);
# named distinctly so it is not confused with the chat model further down
# (the original rebound a single `llm` name for both roles).
query_llm = OpenAI(temperature=0)

retriever = SelfQueryRetriever.from_llm(
    query_llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
    enable_limit=True,
)


def retrieve_info(query):
    """Return the owner-review Documents most relevant to `query`."""
    return retriever.get_relevant_documents(query=query)


# Chat model used for the final answer-generation step.
chat_llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k-0613')
# chat_llm = Tongyi(model_kwargs={"api_key": DASHSCOPE_API_KEY}, model_name="qwen-7b-chat-v1")

# Prompt is in Chinese by design (user-facing answers must be Chinese);
# do not translate the template text.
template = """
你是一名掌握了全部汽车用户真实使用评价内容的智能回复机器人。
我将发送给你一位客户关于汽车使用、购买建议、与其他品牌车型对比等方面的问题。
客户希望你在真实车主评价的基础上,归纳总结形成一句结论性的内容,发送给这位客户,并遵循以下所有规则。
1/ 在篇幅、语气、逻辑论证和其他细节方面,答复应与尽可能的给人专业的感觉,如实客观的表达问题的答案,不要增加你自己的幻觉。
2/ 如果在真实车主评价内容中没有这个问题的相关答案,请回答:“很抱歉,基于真实车主的口碑数据,我暂时不能给出您这个问题的答案。“
{message}
以下是针对这个问题,真实车主评价内容:
{best_practice}
请为这个客户返回最符合问题的最佳回复内容:

所有回复均为中文
"""
prompt = PromptTemplate(
    input_variables=["message", "best_practice"],
    template=template,
)
chain = LLMChain(llm=chat_llm, prompt=prompt)


def generate_response(message):
    """Retrieve relevant owner reviews for `message` and summarize via the LLM.

    Prints retrieval debug output, then returns the chain's answer string.
    """
    best_practice = retrieve_info(message)
    print('message:', message)
    print('向量召回内容Len:', len(best_practice))
    print('向量召回内容:', best_practice)
    # Visual separator between the retrieval debug dump and the answer
    # (replaces seven consecutive print('') calls).
    print('\n' * 6)
    return chain.run(message=message, best_practice=best_practice)


if __name__ == "__main__":
    # Guarded so importing this module no longer fires an API call.
    message = '特斯拉ModelY和宝马x3对比下性价比如何?'
    print(generate_response(message))
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
openai
|
3 |
+
tiktoken
|
4 |
+
CSVLoader==0.0.2
|
5 |
+
python-dotenv==1.0.0
|
6 |
+
streamlit==1.18.1
|
7 |
+
sentence_transformers==2.2.2
|
8 |
+
chromadb==0.3.29
|
9 |
+
dashscope
|
10 |
+
lark
|
11 |
+
# openai==0.27.6
|
12 |
+
# tiktoken==0.4.0
|
13 |
+
faiss-cpu==1.7.4
|
14 |
+
altair==4.0
|