Spaces:

realvest
/

realvest-app

Running

App Files Community

neobot committed on Jun 26, 2023

Commit

49df564

1 Parent(s): 6e7c49a

improve summary

Browse files

Files changed (1) hide show

app.py +60 -20

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import streamlit as st
 import openai
 import pinecone
 from postgres_db import query_postgresql_realvest
 PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
 OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
@@ -11,18 +12,20 @@ EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr
 MAX_LENGTH_DESC = 200
 MATCH_SCORE_THR = 0.0
 TOP_K = 20
-def query_pinecone(xq, top_k: int=3, include_metadata: bool=True, sleep_time: int=10):
     MAX_TRIALS = 5
     trial = 0
     out = None
     while (out is None) and (trial < MAX_TRIALS):
         try:
-            out = st.session_state['index'].query(xq, top_k=top_k, include_metadata=include_metadata)
             return out
         except pinecone.core.exceptions.PineconeProtocolError as err:
             print(f"Error, sleep! {err}")
@@ -96,24 +99,35 @@ def summarize_products(products: list) -> str:
     summary = "{summary of all products}"
     """
     NEW_LINE = '\n'
-    prompt = f"""
-    Based on the product information below, please read and try to understand it.
-    { f"{NEW_LINE*2}---{NEW_LINE*2}".join(products) }
-    Please write a concise and insightful summary table (display as HTML) to compare the products for investors, which should inlcude but not limited to:
-    - description
-    - category
-    - asking price
-    - location
-    - potential profit margin
     """
-    print(f"prompt: {prompt}")
     openai.api_key = OPENAI_API_KEY
     completion = openai.ChatCompletion.create(
         model="gpt-4",
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
         ]
     )
@@ -166,8 +180,8 @@ if st.button('Search'):
     ### call OpenAI text-embedding
     res = openai.Embedding.create(model=EMBEDDING_MODEL, input=[query], api_key=OPENAI_API_KEY)
     xq = res['data'][0]['embedding']
-    out = query_pinecone(xq, top_k=TOP_K, include_metadata=True)
     if (out is not None) and ('matches' in out):
         metadata = {match['metadata']['product_id']: match['metadata'] for match in out['matches'] if 'metadata' in match and match['metadata'] is not None}
@@ -259,14 +273,40 @@ if st.session_state['count_checked'] > 0:
     with summary_container.container():
         st.header('Summary')
         if st.button('Compare Products'):
             products = []
-            for key in st.session_state['checked_boxes']:
-                # TODO: Need to pull all the document
-                # TODO: Need to dedup the pid too
-                pid = key.split('__')[-1]
                 products.append(
-                    st.session_state['metadata'][pid].get('document')
                 )
             with st.spinner('Summarizing...'):
                 summary = summarize_products(products)
                 st.markdown(summary.get("content"), unsafe_allow_html=True)

 import openai
 import pinecone
 from postgres_db import query_postgresql_realvest
+import numpy as np
 PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
 OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]
 MAX_LENGTH_DESC = 200
 MATCH_SCORE_THR = 0.0
 TOP_K = 20
+EMBEDDING_VECTOR_DIM = 1536
+ZERO_EMBEDDING_VECTOR = list(np.zeros(EMBEDDING_VECTOR_DIM))
+def query_pinecone(vector=None, top_k: int=3, include_metadata: bool=True, metadata_filter: dict=None, sleep_time: int=10):
     MAX_TRIALS = 5
     trial = 0
     out = None
     while (out is None) and (trial < MAX_TRIALS):
         try:
+            out = st.session_state['index'].query(vector=vector, top_k=top_k, filter=metadata_filter, include_metadata=include_metadata)
             return out
         except pinecone.core.exceptions.PineconeProtocolError as err:
             print(f"Error, sleep! {err}")
     summary = "{summary of all products}"
     """
     NEW_LINE = '\n'
+    PROMPT_PRODUCTS_SUMMARY = f"""
+You are a very sharp and helpful assistant to a group of commercial real estate investors.
+You are about to write a summary comparison of a few products whose information are given below:
+----- DESCRIPTION of PRODUCTS -----
+{ f"{NEW_LINE*2}---{NEW_LINE*2}".join(products) }
+-----------------------------------
+Please write a concise and insightful summary table to compare the products for investors, which should include but not limited to:
+- title
+- product summary
+- category
+- asking price
+- location
+- potential profit margin
+and display the resulting table in HTML.
     """
+    print(f"prompt: {PROMPT_PRODUCTS_SUMMARY}")
     openai.api_key = OPENAI_API_KEY
     completion = openai.ChatCompletion.create(
         model="gpt-4",
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": PROMPT_PRODUCTS_SUMMARY}
         ]
     )
     ### call OpenAI text-embedding
     res = openai.Embedding.create(model=EMBEDDING_MODEL, input=[query], api_key=OPENAI_API_KEY)
     xq = res['data'][0]['embedding']
+    out = query_pinecone(vector=xq, top_k=TOP_K, include_metadata=True)
     if (out is not None) and ('matches' in out):
         metadata = {match['metadata']['product_id']: match['metadata'] for match in out['matches'] if 'metadata' in match and match['metadata'] is not None}
     with summary_container.container():
         st.header('Summary')
         if st.button('Compare Products'):
+            # populate pids that are checked
+            relevant_pids = [key.split('__')[-1] for key in st.session_state['checked_boxes']]
+            relevant_pids = list(set(relevant_pids))
+            # get metadata from pinecone
+            metadata_filter = {
+                'product_id': {"$in": relevant_pids}
+            }
+            results = query_pinecone(
+                vector=ZERO_EMBEDDING_VECTOR,
+                top_k=100,
+                include_metadata=True,
+                metadata_filter=metadata_filter
+            )
+            # organize document by product_id
+            documents = {}
+            for res in results['matches']:
+                pid, chunk_id = res['id'].split('-')
+                if pid not in documents:
+                    documents[pid] = {}
+                if "chunk" not in documents[pid]:
+                    documents[pid]['chunk'] = {}
+                documents[pid]['chunk'][chunk_id] = res['metadata']['document']
+            # concatenate documents
             products = []
+            for pid, doc in documents.items():
                 products.append(
+                    doc['chunk']['1'] + '\n\n' + doc['chunk']['2']
                 )
+            # summarize
             with st.spinner('Summarizing...'):
                 summary = summarize_products(products)
                 st.markdown(summary.get("content"), unsafe_allow_html=True)