Umang-Bansal committed
Commit bb1b68b · verified · 1 Parent(s): 1ae9bef

Upload 2 files

Files changed (2)
  1. app.py +210 -0
  2. functions.py +162 -0
app.py ADDED
@@ -0,0 +1,210 @@
import streamlit as st
import pandas as pd
from functions import *
from dotenv import load_dotenv

load_dotenv()

def initialize_session_state():
    if 'processing_complete' not in st.session_state:
        st.session_state['processing_complete'] = False
    if 'results_df' not in st.session_state:
        st.session_state['results_df'] = None
    if 'output_choice' not in st.session_state:
        st.session_state['output_choice'] = "Download CSV"

initialize_session_state()

def main():
    st.title("InfoSynth")

    df = None

    # File upload section
    st.header("1. Upload Your Data")
    data_source = st.radio("Choose a data source:", ["CSV File", "Google Sheet"])

    if data_source == "CSV File":
        uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])

        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
    else:
        st.info(
            "Before proceeding, ensure your Google Sheet is shared with the service account. "
            "You can find the service account email in your credentials.json file."
        )
        spreadsheet_id = st.text_input(
            "Enter Google Spreadsheet ID",
            help="You can find this in the spreadsheet URL between /d/ and /edit"
        )

        sheet_names = None
        if spreadsheet_id:
            try:
                sheet_names = get_all_sheet_names(spreadsheet_id)
                if not sheet_names:
                    st.error("No sheets found in this spreadsheet. Please check the ID and permissions.")
            except ValueError as e:
                st.error(f"Error accessing spreadsheet: {str(e)}")
                st.info("Please check the ID and permissions.")
            except Exception as e:
                st.error(f"Error accessing spreadsheet: {str(e)}")
                sheet_names = []

        sheet_name = None
        if sheet_names:
            sheet_name = st.selectbox(
                "Select Sheet Name",
                options=sheet_names,
                help="The name of the specific sheet to read from"
            )

        if spreadsheet_id and sheet_name:
            try:
                df = load_google_sheet(spreadsheet_id, sheet_name)
                if df is None or df.empty:
                    st.error("No data found in the selected sheet.")
            except Exception as e:
                st.error(f"Error loading sheet data: {str(e)}")
                df = None

    if df is not None:
        try:
            # Display available columns for selection
            st.header("2. Select Primary Column")
            primary_column = st.selectbox(
                "Choose the main column for analysis:",
                options=df.columns.tolist()
            )

            # Show data preview
            st.header("3. Data Preview")
            st.write("First 5 rows of your data:")
            st.dataframe(df.head())

            # Query template section
            st.header("4. Query Template")
            st.write(f"""
            Create your query template using {{{primary_column}}} as a placeholder.
            Example: "What products does {{{primary_column}}} offer?"
            """)

            query_template = st.text_area(
                "Enter your query template:",
                value=f"Tell me about {{{primary_column}}}",
                help=f"Use {{{primary_column}}} as a placeholder"
            )

            # Preview generated queries
            #if st.button("Preview Generated Queries"):
            #    st.subheader("Generated Queries Preview")
            #    # Get first 5 values from the selected column
            #    sample_values = df[primary_column].head()
            #
            #    # Display example queries
            #    for value in sample_values:
            #        generated_query = query_template.replace(
            #            f"{{{primary_column}}}", str(value)
            #        )
            #        st.write(f"- {generated_query}")
            #
            #    # Show total number of queries that will be generated
            #    st.info(f"Total queries to be generated: {len(df)}")

            # Confirmation and processing section
            st.header("5. Process Queries")
            total_queries = len(df[primary_column])
            estimated_time = total_queries * 2  # ~2 seconds per query due to rate limiting

            st.warning(f"""
            ⚠️ Please confirm:
            - Number of queries to process: {total_queries}
            - Estimated processing time: {estimated_time} seconds ({estimated_time/60:.1f} minutes)
            - This will use {total_queries} API calls
            """)

            # Show sample of what will be processed
            #st.subheader("Sample of data to be processed:")
            #sample_df = df[[primary_column]].head()
            #st.dataframe(sample_df)

            # Process button with confirmation
            if st.button("Start Processing"):
                with st.spinner("Processing queries..."):
                    # Add a progress bar
                    progress_bar = st.progress(0)

                    results = []
                    llm = setup_llm()
                    for index, row in df.iterrows():
                        try:
                            value = row[primary_column]

                            # Handle empty/null values
                            if pd.isna(value) or str(value).strip() == '':
                                results.append({
                                    'input_value': value,
                                    'result': 'NA'
                                })
                                continue

                            query = query_template.replace(f"{{{primary_column}}}", str(value))

                            # Display current processing item
                            st.text(f"Processing: {value}")

                            # Search for this row, then extract the answer with the LLM
                            result = process_queries(pd.DataFrame([row]), primary_column, query)
                            output = process_with_ai(result, query, llm)

                            results.append({
                                'input_value': value,
                                'result': output.content
                            })

                            # Update progress
                            progress_bar.progress((index + 1) / total_queries)

                        except Exception as e:
                            st.error(f"Error processing {value}: {str(e)}")
                            continue

                    # Mark processing complete and store results
                    st.session_state['processing_complete'] = True
                    st.session_state['results_df'] = pd.DataFrame(results, columns=['input_value', 'result'])

            # Show results and save options if processing is complete
            if st.session_state['processing_complete']:
                st.success(f"✅ Completed processing {len(st.session_state['results_df'])} queries!")

                st.subheader("Results Preview:")
                st.dataframe(st.session_state['results_df'].head())

                st.header("6. Save Results")
                output_choice = st.radio("Choose an output format:", ["Download CSV", "Update Google Sheet"])

                if output_choice == "Download CSV":
                    csv = st.session_state['results_df'].to_csv(index=False)
                    if st.download_button(
                        "Download Complete Results (CSV)",
                        csv,
                        "search_results.csv",
                        "text/csv",
                        key='download-csv'
                    ):
                        st.success("✅ File downloaded successfully!")

                elif output_choice == "Update Google Sheet":
                    update_button = st.button("Confirm Update to Google Sheet")
                    if update_button:
                        try:
                            write_to_google_sheet(spreadsheet_id, sheet_name, st.session_state['results_df'])
                            st.success("✅ Results successfully added as new column!")
                        except Exception as e:
                            st.error(f"Error updating sheet: {str(e)}")
        except Exception as e:
            st.error(f"Error processing the file: {str(e)}")

if __name__ == "__main__":
    main()
functions.py ADDED
@@ -0,0 +1,162 @@
import streamlit as st
import pandas as pd
import time
import os
from typing import List, Dict
from serpapi import GoogleSearch
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
import gspread
from google.oauth2.service_account import Credentials

def get_sheet_client():
    """Helper function to create an authenticated Google Sheets client"""
    try:
        scope = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = Credentials.from_service_account_file("credentials.json", scopes=scope)
        client = gspread.authorize(creds)

        # Get service account email for error messages
        service_account_email = creds.service_account_email
        st.session_state['service_account_email'] = service_account_email

        return client
    except FileNotFoundError:
        raise ValueError(
            "credentials.json file not found. Please ensure it exists in the project directory."
        )
    except Exception as e:
        raise ValueError(f"Error setting up Google Sheets client: {str(e)}")

def get_worksheet(sheet_id: str, range_name: str = None):
    """Helper function to get a worksheet with improved error handling"""
    try:
        client = get_sheet_client()
        sheet = client.open_by_key(sheet_id)
        return sheet.worksheet(range_name) if range_name else sheet
    except gspread.exceptions.SpreadsheetNotFound:
        service_email = st.session_state.get('service_account_email', 'the service account')
        raise ValueError(
            f"Spreadsheet not found. Please verify:\n"
            f"1. The spreadsheet ID is correct\n"
            f"2. The sheet is shared with {service_email}\n"
            f"3. Sharing permissions allow edit access"
        )
    except gspread.exceptions.WorksheetNotFound:
        raise ValueError(f"Worksheet '{range_name}' not found in the spreadsheet")
    except gspread.exceptions.APIError as e:
        if 'PERMISSION_DENIED' in str(e):
            service_email = st.session_state.get('service_account_email', 'the service account')
            raise ValueError(
                f"Permission denied. Please share the spreadsheet with {service_email} "
                f"and ensure it has edit access."
            )
        raise ValueError(f"Google Sheets API error: {str(e)}")

def process_queries(df: pd.DataFrame, primary_column: str, query_template: str) -> List[Dict]:
    """Run a SerpAPI search for each row and collect the organic results"""
    results = []

    serpapi_key = os.getenv("SERPAPI_API_KEY")
    for index, row in df.iterrows():
        try:
            value = row[primary_column]
            query = query_template.replace(f"{{{primary_column}}}", str(value))

            # Perform search
            search = GoogleSearch({
                "q": query,
                "gl": "in",
                "api_key": serpapi_key,
                "num": 5
            })
            search_results = search.get_dict()

            # Store results
            results.append({
                primary_column: value,
                "query": query,
                "search_results": search_results.get("organic_results", [])
            })

            # Rate limiting
            time.sleep(1)

            if index % 10 == 0:
                st.write(f"Processed {index + 1} queries...")

        except Exception as e:
            st.warning(f"Error processing query for {value}: {str(e)}")
            continue

    return results

def setup_llm():
    """Set up LangChain with Groq"""
    api_key = os.getenv("GROQ_API_KEY")
    llm = ChatGroq(
        api_key=api_key,
        model="llama-3.1-8b-instant",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )
    return llm

def process_with_ai(search_results: dict, query: str, llm):
    """Ask the LLM to extract only the requested information from the search results"""
    template = """
    Extract ONLY the specific information requested from the search results for: {query}

    Search Results:
    {search_results}

    Provide ONLY the extracted information as a simple text response.
    If multiple items exist, separate them with semicolons.
    If no relevant information is found, respond with "Not found".

    For example:
    - If asked for locations: "Bengaluru; Mumbai; Delhi"
    - If asked for email: "[email protected]"
    - If asked for address: "123 Main Street, City, Country"
    """

    prompt = PromptTemplate(
        input_variables=["query", "search_results"],
        template=template
    )

    chain = prompt | llm
    response = chain.invoke({"query": query, "search_results": search_results})

    # Returns the chat message object; callers read response.content
    return response

def load_google_sheet(sheet_id: str, range_name: str) -> pd.DataFrame:
    worksheet = get_worksheet(sheet_id, range_name)
    data = worksheet.get_all_records()
    return pd.DataFrame(data)

def write_to_google_sheet(sheet_id: str, range_name: str, results_df: pd.DataFrame):
    """Append the results as a new column to the right of the existing data"""
    worksheet = get_worksheet(sheet_id, range_name)

    all_values = worksheet.get_all_values()
    num_rows = len(all_values)
    next_col_num = len(all_values[0]) + 1
    next_col_letter = chr(64 + next_col_num)  # note: only handles columns up to Z

    cell_range = f'{next_col_letter}1:{next_col_letter}{num_rows}'

    values = [['AI Results']] + [[str(result)] for result in results_df['result']]

    worksheet.update(values, cell_range)

def get_all_sheet_names(sheet_id: str) -> List[str]:
    spreadsheet = get_worksheet(sheet_id)
    return [ws.title for ws in spreadsheet.worksheets()]
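
Both files read their configuration from the environment: app.py calls load_dotenv(), functions.py looks up SERPAPI_API_KEY and GROQ_API_KEY via os.getenv, and the Sheets helpers read a credentials.json service-account file from the project directory. As a minimal sketch of the expected local setup (variable names come from the code above; the values are placeholders, not real keys), a .env file in the project root might look like:

    # .env — placeholder values, replace with your own keys
    SERPAPI_API_KEY=your-serpapi-key
    GROQ_API_KEY=your-groq-key

with credentials.json (the Google service-account key) saved alongside app.py and functions.py.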