neobot committed on
Commit
72f0d06
·
2 Parent(s): f4e7e13 85c6280

Yong persist Pinecone index

Browse files
Files changed (3) hide show
  1. .gitignore +102 -0
  2. README.md +1 -1
  3. app.py +89 -70
.gitignore CHANGED
@@ -1 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  .streamlit/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ########################################################################
2
+ # Python - https://github.com/github/gitignore/blob/master/Python.gitignore
3
+ ########################################################################
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # Distribution / packaging
10
+ build/
11
+ dist/
12
+ eggs/
13
+ .eggs/
14
+ *.egg-info/
15
+ *.egg
16
+ .prerelease-version
17
+
18
+ # Unit test / coverage reports
19
+ .coverage
20
+ .coverage\.*
21
+ .pytest_cache/
22
+ .mypy_cache/
23
+ test-reports
24
+ htmlcov
25
+ .hypothesis
26
+
27
+ # Test fixtures
28
+ cffi_bin
29
+
30
+ # Pyenv Stuff
31
+ .python-version
32
+ venv
33
+
34
+ # Autogenerated Protobufs
35
+ lib/streamlit/proto/*_pb2.py
36
+ lib/streamlit/proto/*_pb2.pyi
37
+ frontend/src/lib/proto.js
38
+ frontend/src/lib/proto.d.ts
39
+
40
+ ########################################################################
41
+ # OSX - https://github.com/github/gitignore/blob/master/Global/macOS.gitignore
42
+ ########################################################################
43
+ .DS_Store
44
+ .DocumentRevisions-V100
45
+ .fseventsd
46
+ .Spotlight-V100
47
+ .TemporaryItems
48
+ .Trashes
49
+ .VolumeIcon.icns
50
+ .com.apple.timemachine.donotpresent
51
+
52
+ ########################################################################
53
+ # node - https://github.com/github/gitignore/blob/master/Node.gitignore
54
+ ########################################################################
55
+ # Logs
56
+ npm-debug.log*
57
+ yarn-debug.log*
58
+ yarn-error.log*
59
+
60
+ # Dependency directories
61
+ node_modules/
62
+
63
+ # ESLint
64
+ .eslintcache
65
+
66
+ # Coverage directory used by tools like istanbul
67
+ coverage/
68
+
69
+ ########################################################################
70
+ # Streamlit
71
+ ########################################################################
72
  .streamlit/
73
+ lib/streamlit/static
74
+ streamlit-storage
75
+
76
+ # Data Files
77
+ **/uber-raw-data-sep14.csv
78
+
79
+ lib/Pipfile.lock
80
+
81
+ ########################################################################
82
+ # JetBrains
83
+ ########################################################################
84
+ .idea
85
+
86
+ ########################################################################
87
+ # Cypress
88
+ ########################################################################
89
+ frontend/cypress/downloads
90
+ frontend/cypress/videos
91
+ frontend/cypress/screenshots
92
+ __diff_output__
93
+ cypress.env.json
94
+ frontend/test_results
95
+
96
+ # Ignore screenshots that don't get used in CircleCI
97
+ frontend/cypress/snapshots/darwin
98
+ frontend/cypress/snapshots/linux/1x
99
+
100
+ ########################################################################
101
+ # VSCode
102
+ ########################################################################
103
+ .vscode/
README.md CHANGED
@@ -16,7 +16,7 @@ git clone https://huggingface.co/spaces/realvest/realvest-app
16
  pyenv install 3.9
17
  pyenv local 3.9
18
 
19
- poetry use env 3.9.17
20
  poetry install
21
  poetry shell
22
  ```
 
16
  pyenv install 3.9
17
  pyenv local 3.9
18
 
19
+ poetry env use 3.9.17
20
  poetry install
21
  poetry shell
22
  ```
app.py CHANGED
@@ -12,39 +12,26 @@ MAX_LENGTH_DESC = 200
12
  MATCH_SCORE_THR = 0.0
13
  TOP_K = 20
14
 
15
- def test_pinecone(sleep_time: int=1):
16
- MAX_TRIALS = 5
17
- trial = 0
18
- stats = None
19
- while (stats is None) and (trial < MAX_TRIALS):
20
- try:
21
- print(f"BEFORE: trial: {trial}; stats: {stats}")
22
- stats = index.describe_index_stats()
23
- print(f"AFTER: trial: {trial}; stats: {stats}")
24
- return stats
25
- except pinecone.core.exceptions.PineconeProtocolError as err:
26
- print(f"Error, sleep! {err}")
27
- sleep(sleep_time)
28
- trial = trial + 1
29
-
30
- raise Exception(f'max trials {MAX_TRIALS} Exceeded!')
31
 
32
- def query_pinecone(xq, top_k: int=3, include_metadata: bool=True, sleep_time: int=1):
 
 
 
33
  MAX_TRIALS = 5
34
  trial = 0
35
  out = None
36
  while (out is None) and (trial < MAX_TRIALS):
37
  try:
38
  # print(f"BEFORE: trial: {trial}; stats: {out}")
39
- out = index.query(xq, top_k=top_k, include_metadata=include_metadata)
40
  # print(f"AFTER: trial: {trial}; stats: {out}")
41
  return out
42
  except pinecone.core.exceptions.PineconeProtocolError as err:
43
  print(f"Error, sleep! {err}")
44
  sleep(sleep_time)
45
  trial = trial + 1
46
-
47
- raise Exception(f'max trials {MAX_TRIALS} Exceeded!')
48
 
49
  def sort_dict_by_value(d: dict, ascending: bool=True):
50
  """
@@ -53,7 +40,37 @@ def sort_dict_by_value(d: dict, ascending: bool=True):
53
  """
54
  return sorted(d.items(), key=lambda x: x[1], reverse=not ascending)
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def init_session_state():
 
 
 
 
 
57
  if 'display_results' not in st.session_state:
58
  st.session_state['display_results'] = False
59
 
@@ -105,14 +122,9 @@ def summarize_products(products: list) -> str:
105
  summary = completion.choices[0].message
106
  return summary
107
 
108
- # initialize connection to pinecone (get API key at app.pinecone.io)
109
- pinecone.init(
110
- api_key=PINECONE_API_KEY,
111
- environment="us-central1-gcp" # may be different, check at app.pinecone.io
112
- )
113
- index = pinecone.Index(INDEX_NAME)
114
- # stats = test_pinecone()
115
- # print(f"Pinecone DB stats: {stats}")
116
 
117
  ### Main
118
  # st.set_page_config(layout="centered")
@@ -133,53 +145,60 @@ query = st.text_input("What are you looking for?")
133
  if st.button('Submit'):
134
 
135
  # initialize
136
- st.session_state.clear()
137
- init_session_state()
138
 
139
  # ### call OpenAI text-embedding
140
  res = openai.Embedding.create(model=EMBEDDING_MODEL, input=[query], api_key=OPENAI_API_KEY)
141
  xq = res['data'][0]['embedding']
142
  out = query_pinecone(xq, top_k=TOP_K, include_metadata=True)
143
-
144
- ### candidates
145
- metadata = {match['metadata']['product_id']: match['metadata'] for match in out['matches']}
146
- match_score = {match['metadata']['product_id']: match['score'] for match in out['matches']}
147
- above_thr_sorted = [
148
- item
149
- for item in sort_dict_by_value(match_score, ascending=False)
150
- if item[1] > MATCH_SCORE_THR
151
- ]
152
- pids = metadata.keys()
153
-
154
- ### query pids
155
- pids_str = [f"'{pid}'" for pid, _ in above_thr_sorted]
156
- query = f"""
157
- SELECT productid, name, category, alternatename, url, logo, description
158
- FROM main_products
159
- WHERE productid in ({', '.join(pids_str)});
160
- """
161
-
162
- results = query_postgresql_realvest(query)
163
- results = {
164
- result['productid']: result
165
- for result in results
166
- }
167
-
168
- ### For test
169
- # print(f"above_thr_sorted: {above_thr_sorted}")
170
- # print(f"results: {results}")
171
- # print(f"metadata: {metadata}")
172
- # # TEST ONLY
173
- # above_thr_sorted = [('2086773', 0.800059378), ('1951083', 0.797319531), ('1998714', 0.795623)]
174
- # results = {'1951083': {'productid': '1951083', 'name': '2 for 1 Turn-key Business Opportunity in Lynnwood, Washington - BizBuySell', 'category': 'Other', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Opportunity/2-for-1-Turn-key-Business-Opportunity/1951083/', 'logo': 'https://images.bizbuysell.com/shared/listings/195/1951083/87198e08-a191-4d97-a33b-9e9f40fa02f4-W768.jpg', 'description': 'Your chance to own a successful Korean traditional KBBQ grill restaurant and Korean dive bar. Owner is retiring after 19 years of business. This Korean BBQ restaurant utilizes a traditional grill called "Sot Ttu Kkeong" widely found in Korea. There are 10 separate grilling tables with a unique hood system to eliminate odors immediately. The bar next door may be able to extend hours into the summer. With one shared full kitchen, the new owner will be able to maximize business and potentially earn double income.'}, '1998714': {'productid': '1998714', 'name': 'Portland CPA Firm in Portland, Oregon - BizBuySell', 'category': 'Accounting and Tax Practices', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Opportunity/Portland-CPA-Firm/1998714/', 'logo': 'https://images.bizbuysell.com/shared/listings/199/1998714/cd02bdb9-32c9-409d-b82e-d0531c12eb39-W768.jpg', 'description': 'OR1002: UPDATED :The seller of this Portland CPA firm is approaching retirement and ready to sell the firm. The firm has a great reputation, has good systems in place, is paperless, and has a great staff. The mix of services offers a consistent stream of cash flow to the owner. The seller is seeking a CPA buyer. The office space is available for continued lease after the sale. 
Revenues for sale include:7% Accounting, bookkeeping and payroll services26% Income tax preparation services for individual clients35% Income tax preparation services for business and other clients28% Audits and reviews4% Consulting services'}, '2086773': {'productid': '2086773', 'name': 'Asian Grocery Supermarket, 1 owner for 29 years in Salem, Oregon - BizBuySell', 'category': 'Grocery Stores and Supermarkets', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Real-Estate-For-Sale/Asian-Grocery-Supermarket-1-owner-for-29-years/2086773/', 'logo': 'https://images.bizbuysell.com/shared/listings/208/2086773/861f6ba6-a994-4e90-9c62-0a593dae2a31-W768.jpg', 'description': 'Great location, well established and profitable supermarket.We have been the sole owner for almost 29 years, so business boasts of a great reputation.'}}
175
- # metadata = {'2086773': {'asking_price': 1000000.0, 'asking_price_currency': 'USD', 'building_status': 'Established', 'category': 'Grocery Stores and Supermarkets', 'chunk_type': 'profile', 'city': 'Salem', 'document': '# Listing Profile\n \nAsking Price (USD): 1000000 \n\nReason for Selling: Retire ', 'listing_type': 'Retail', 'location': 'Salem, OR', 'main_category': 'Grocery Stores and Supermarkets', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Salem', 'offers__available_from__address__region': 'Oregon', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '2086773', 'similar_pids': ['2074401', '2087795', '2068650'], 'state_code': 'OR'}, '1951083': {'asking_price': 200000.0, 'asking_price_currency': 'USD', 'category': 'Other', 'chunk_type': 'profile', 'city': 'Lynnwood', 'document': '# Listing Profile\n \nAsking Price (USD): 200000 \n\nReason for Selling: Retiring ', 'location': 'Lynnwood, WA', 'main_category': 'Other', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Lynnwood', 'offers__available_from__address__region': 'Washington', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '1951083', 'similar_pids': ['2113741', '2033980', '2034855'], 'state_code': 'WA'}, '1998714': {'asking_price': 900000.0, 'asking_price_currency': 'USD', 'category': 'Accounting and Tax Practices', 'chunk_type': 'profile', 'city': 'Portland', 'document': '# Listing Profile\n \nAsking Price (USD): 900000 \n\nReason for Selling: Approaching retirement ', 'fin__gross_revenue': 958000.0, 'location': 'Portland, OR', 'main_category': 'Accounting and Tax Practices', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Portland', 'offers__available_from__address__region': 'Oregon', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '1998714', 
'similar_pids': ['2026155', '2066311'], 'state_code': 'OR'}}
176
-
177
- # update
178
- st.session_state['above_thr_sorted'] = above_thr_sorted
179
- st.session_state['results'] = results
180
- st.session_state['metadata'] = metadata
181
-
182
- st.session_state['display_results'] = True
 
 
 
 
 
 
 
183
 
184
  if st.session_state['display_results']:
185
 
 
12
  MATCH_SCORE_THR = 0.0
13
  TOP_K = 20
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+
17
+
18
+
19
+ def query_pinecone(xq, top_k: int=3, include_metadata: bool=True, sleep_time: int=10):
20
  MAX_TRIALS = 5
21
  trial = 0
22
  out = None
23
  while (out is None) and (trial < MAX_TRIALS):
24
  try:
25
  # print(f"BEFORE: trial: {trial}; stats: {out}")
26
+ out = st.session_state['index'].query(xq, top_k=top_k, include_metadata=include_metadata)
27
  # print(f"AFTER: trial: {trial}; stats: {out}")
28
  return out
29
  except pinecone.core.exceptions.PineconeProtocolError as err:
30
  print(f"Error, sleep! {err}")
31
  sleep(sleep_time)
32
  trial = trial + 1
33
+ return out
34
+ #raise Exception(f'max trials {MAX_TRIALS} Exceeded!')
35
 
36
  def sort_dict_by_value(d: dict, ascending: bool=True):
37
  """
 
40
  """
41
  return sorted(d.items(), key=lambda x: x[1], reverse=not ascending)
42
 
43
+ # initialize connection to pinecone (get API key at app.pinecone.io)
44
+ from tenacity import retry, stop_after_attempt, wait_fixed
45
+ # initialize connection to pinecone (get API key at app.pinecone.io)
46
+ pinecone.init(
47
+ api_key=PINECONE_API_KEY,
48
+ environment="us-central1-gcp" # may be different, check at app.pinecone.io
49
+ )
50
+
51
+
52
+ @retry(stop=stop_after_attempt(5), wait=wait_fixed(15))
53
+ def setup_pinecone_index():
54
+ try:
55
+ print("Attempting to set up Pinecone index...") # add this line
56
+ if "index" not in st.session_state:
57
+ st.session_state['index'] = pinecone.Index(INDEX_NAME)
58
+ return st.session_state['index']
59
+ except AttributeError as e:
60
+ print("Caught an AttributeError:", e)
61
+ raise # Re-raise the exception so that tenacity can catch it and retry
62
+ except Exception as e: # add this block
63
+ print("Caught an unexpected exception:", e)
64
+ raise
65
+
66
+
67
+
68
  def init_session_state():
69
+ try:
70
+ st.session_state['index'] = setup_pinecone_index()
71
+ # stats = test_pinecone()
72
+ except Exception as e:
73
+ print("Failed to set up Pinecone index after several attempts. Error:", e)
74
  if 'display_results' not in st.session_state:
75
  st.session_state['display_results'] = False
76
 
 
122
  summary = completion.choices[0].message
123
  return summary
124
 
125
+
126
+
127
+
 
 
 
 
 
128
 
129
  ### Main
130
  # st.set_page_config(layout="centered")
 
145
  if st.button('Submit'):
146
 
147
  # initialize
148
+ #st.session_state.clear()
149
+ #init_session_state()
150
 
151
  # ### call OpenAI text-embedding
152
  res = openai.Embedding.create(model=EMBEDDING_MODEL, input=[query], api_key=OPENAI_API_KEY)
153
  xq = res['data'][0]['embedding']
154
  out = query_pinecone(xq, top_k=TOP_K, include_metadata=True)
155
+
156
+ if out is not None and 'matches' in out:
157
+ metadata = {match['metadata']['product_id']: match['metadata'] for match in out['matches'] if 'metadata' in match and match['metadata'] is not None}
158
+
159
+
160
+ ### candidates
161
+ metadata = {match['metadata']['product_id']: match['metadata'] for match in out['matches']}
162
+ match_score = {match['metadata']['product_id']: match['score'] for match in out['matches']}
163
+ above_thr_sorted = [
164
+ item
165
+ for item in sort_dict_by_value(match_score, ascending=False)
166
+ if item[1] > MATCH_SCORE_THR
167
+ ]
168
+ pids = metadata.keys()
169
+
170
+ ### query pids
171
+ pids_str = [f"'{pid}'" for pid, _ in above_thr_sorted]
172
+ query = f"""
173
+ SELECT productid, name, category, alternatename, url, logo, description
174
+ FROM main_products
175
+ WHERE productid in ({', '.join(pids_str)});
176
+ """
177
+
178
+ results = query_postgresql_realvest(query)
179
+ results = {
180
+ result['productid']: result
181
+ for result in results
182
+ }
183
+
184
+ ### For test
185
+ # print(f"above_thr_sorted: {above_thr_sorted}")
186
+ # print(f"results: {results}")
187
+ # print(f"metadata: {metadata}")
188
+ # # TEST ONLY
189
+ # above_thr_sorted = [('2086773', 0.800059378), ('1951083', 0.797319531), ('1998714', 0.795623)]
190
+ # results = {'1951083': {'productid': '1951083', 'name': '2 for 1 Turn-key Business Opportunity in Lynnwood, Washington - BizBuySell', 'category': 'Other', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Opportunity/2-for-1-Turn-key-Business-Opportunity/1951083/', 'logo': 'https://images.bizbuysell.com/shared/listings/195/1951083/87198e08-a191-4d97-a33b-9e9f40fa02f4-W768.jpg', 'description': 'Your chance to own a successful Korean traditional KBBQ grill restaurant and Korean dive bar. Owner is retiring after 19 years of business. This Korean BBQ restaurant utilizes a traditional grill called "Sot Ttu Kkeong" widely found in Korea. There are 10 separate grilling tables with a unique hood system to eliminate odors immediately. The bar next door may be able to extend hours into the summer. With one shared full kitchen, the new owner will be able to maximize business and potentially earn double income.'}, '1998714': {'productid': '1998714', 'name': 'Portland CPA Firm in Portland, Oregon - BizBuySell', 'category': 'Accounting and Tax Practices', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Opportunity/Portland-CPA-Firm/1998714/', 'logo': 'https://images.bizbuysell.com/shared/listings/199/1998714/cd02bdb9-32c9-409d-b82e-d0531c12eb39-W768.jpg', 'description': 'OR1002: UPDATED :The seller of this Portland CPA firm is approaching retirement and ready to sell the firm. The firm has a great reputation, has good systems in place, is paperless, and has a great staff. The mix of services offers a consistent stream of cash flow to the owner. The seller is seeking a CPA buyer. The office space is available for continued lease after the sale. 
Revenues for sale include:7% Accounting, bookkeeping and payroll services26% Income tax preparation services for individual clients35% Income tax preparation services for business and other clients28% Audits and reviews4% Consulting services'}, '2086773': {'productid': '2086773', 'name': 'Asian Grocery Supermarket, 1 owner for 29 years in Salem, Oregon - BizBuySell', 'category': 'Grocery Stores and Supermarkets', 'alternatename': None, 'url': 'https://www.bizbuysell.com/Business-Real-Estate-For-Sale/Asian-Grocery-Supermarket-1-owner-for-29-years/2086773/', 'logo': 'https://images.bizbuysell.com/shared/listings/208/2086773/861f6ba6-a994-4e90-9c62-0a593dae2a31-W768.jpg', 'description': 'Great location, well established and profitable supermarket.We have been the sole owner for almost 29 years, so business boasts of a great reputation.'}}
191
+ # metadata = {'2086773': {'asking_price': 1000000.0, 'asking_price_currency': 'USD', 'building_status': 'Established', 'category': 'Grocery Stores and Supermarkets', 'chunk_type': 'profile', 'city': 'Salem', 'document': '# Listing Profile\n \nAsking Price (USD): 1000000 \n\nReason for Selling: Retire ', 'listing_type': 'Retail', 'location': 'Salem, OR', 'main_category': 'Grocery Stores and Supermarkets', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Salem', 'offers__available_from__address__region': 'Oregon', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '2086773', 'similar_pids': ['2074401', '2087795', '2068650'], 'state_code': 'OR'}, '1951083': {'asking_price': 200000.0, 'asking_price_currency': 'USD', 'category': 'Other', 'chunk_type': 'profile', 'city': 'Lynnwood', 'document': '# Listing Profile\n \nAsking Price (USD): 200000 \n\nReason for Selling: Retiring ', 'location': 'Lynnwood, WA', 'main_category': 'Other', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Lynnwood', 'offers__available_from__address__region': 'Washington', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '1951083', 'similar_pids': ['2113741', '2033980', '2034855'], 'state_code': 'WA'}, '1998714': {'asking_price': 900000.0, 'asking_price_currency': 'USD', 'category': 'Accounting and Tax Practices', 'chunk_type': 'profile', 'city': 'Portland', 'document': '# Listing Profile\n \nAsking Price (USD): 900000 \n\nReason for Selling: Approaching retirement ', 'fin__gross_revenue': 958000.0, 'location': 'Portland, OR', 'main_category': 'Accounting and Tax Practices', 'offer_type': 'Offer', 'offers__available_from__address__locality': 'Portland', 'offers__available_from__address__region': 'Oregon', 'offers__available_from__address__type': 'PostalAddress', 'offers__available_from__type': 'Place', 'product_id': '1998714', 
'similar_pids': ['2026155', '2066311'], 'state_code': 'OR'}}
192
+
193
+ # update
194
+ st.session_state['above_thr_sorted'] = above_thr_sorted
195
+ st.session_state['results'] = results
196
+ st.session_state['metadata'] = metadata
197
+
198
+ st.session_state['display_results'] = True
199
+ else:
200
+ print("No matches found.")
201
+ metadata = {}
202
 
203
  if st.session_state['display_results']:
204