Spaces:
Sleeping
Sleeping
import re | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import gradio as gr | |
from groq import Groq | |
import os | |
from dotenv import load_dotenv | |
# Step 1: Scrape the free courses from Analytics Vidhya | |
url = "https://courses.analyticsvidhya.com/pages/all-free-courses" | |
response = requests.get(url) | |
soup = BeautifulSoup(response.content, 'html.parser') | |
courses = [] | |
# Extracting course title, image, and course link | |
for course_card in soup.find_all('header', class_='course-card__img-container'): | |
img_tag = course_card.find('img', class_='course-card__img') | |
if img_tag: | |
title = img_tag.get('alt') | |
image_url = img_tag.get('src') | |
link_tag = course_card.find_previous('a') | |
if link_tag: | |
course_link = link_tag.get('href') | |
if not course_link.startswith('http'): | |
course_link = 'https://courses.analyticsvidhya.com' + course_link | |
courses.append({ | |
'title': title, | |
'image_url': image_url, | |
'course_link': course_link | |
}) | |
# Step 2: Create DataFrame | |
df = pd.DataFrame(courses) | |
load_dotenv() | |
client = Groq(api_key=os.getenv("GROQ_API_KEY")) | |
def search_courses(query): | |
try: | |
print(f"Searching for: {query}") | |
print(f"Number of courses in database: {len(df)}") | |
# Prepare the prompt for Groq | |
prompt = f"""Given the following query: "{query}" | |
Please analyze the query and rank the following courses based on their relevance to the query. | |
Prioritize courses from Analytics Vidhya. Provide a relevance score from 0 to 1 for each course. | |
Only return courses with a relevance score of 0.5 or higher. | |
Return the results in the following format: | |
Title: [Course Title] | |
Relevance: [Score] | |
Courses: | |
{df['title'].to_string(index=False)} | |
""" | |
print("Sending request to Groq...") | |
# Get response from Groq | |
response = client.chat.completions.create( | |
model="llama-3.2-1b-preview", | |
messages=[ | |
{"role": "system", "content": "You are an AI assistant specialized in course recommendations."}, | |
{"role": "user", "content": prompt} | |
], | |
temperature=0.2, | |
max_tokens=1000 | |
) | |
print("Received response from Groq") | |
# Parse Groq's response | |
results = [] | |
print("Groq response content:") | |
print(response.choices[0].message.content) | |
# Use regex to extract course titles and relevance scores | |
matches = re.findall(r'\*\*(.+?)\*\*\s*\(Relevance Score: (0\.\d+)\)', response.choices[0].message.content) | |
for title, score in matches: | |
title = title.strip() | |
score = float(score) | |
if score >= 0.5: | |
matching_courses = df[df['title'].str.contains(title[:30], case=False, na=False)] | |
if not matching_courses.empty: | |
course = matching_courses.iloc[0] | |
results.append({ | |
'title': course['title'], # Use the full title from the database | |
'image_url': course['image_url'], | |
'course_link': course['course_link'], | |
'score': score | |
}) | |
print(f"Added course: {course['title']}") | |
else: | |
print(f"Warning: Course not found in database: {title}") | |
print(f"Number of results found: {len(results)}") | |
return sorted(results, key=lambda x: x['score'], reverse=True)[:10] # Return top 10 results | |
except Exception as e: | |
print(f"An error occurred in search_courses: {str(e)}") | |
return [] | |
def gradio_search(query): | |
result_list = search_courses(query) | |
if result_list: | |
html_output = '<div class="results-container">' | |
for item in result_list: | |
course_title = item['title'] | |
course_image = item['image_url'] | |
course_link = item['course_link'] | |
relevance_score = round(item['score'] * 100, 2) | |
html_output += f''' | |
<div class="course-card"> | |
<img src="{course_image}" alt="{course_title}" class="course-image"/> | |
<div class="course-info"> | |
<h3>{course_title}</h3> | |
<p>Relevance: {relevance_score}%</p> | |
<a href="{course_link}" target="_blank" class="course-link">View Course</a> | |
</div> | |
</div>''' | |
html_output += '</div>' | |
return html_output | |
else: | |
return '<p class="no-results">No results found. Please try a different query.</p>' | |
custom_css = """ | |
body { | |
font-family: Arial, Helvetica, sans-serif; | |
background-color: #f0f2f5; | |
} | |
.container { | |
max-width: 600px; | |
margin: 0 auto; | |
padding: 20px; | |
} | |
.results-container { | |
display: flex; | |
flex-direction: column; | |
} | |
.course-card { | |
background-color: white; | |
border-radius: 8px; | |
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); | |
margin-bottom: 20px; | |
overflow: hidden; | |
width: 100%; | |
transition: transform 0.2s; | |
} | |
.course-card:hover { | |
transform: translateY(-5px); | |
} | |
.course-image { | |
width: 100%; | |
height: 200px; | |
object-fit: cover; | |
} | |
.course-info { | |
padding: 15px; | |
} | |
.course-info h3 { | |
margin-top: 0; | |
font-size: 18px; | |
color: #333; | |
} | |
.course-info p { | |
color: #666; | |
font-size: 14px; | |
margin-bottom: 10px; | |
} | |
.course-link { | |
display: inline-block; | |
background-color: #007bff; | |
color: white; | |
padding: 8px 12px; | |
text-decoration: none; | |
border-radius: 4px; | |
font-size: 14px; | |
transition: background-color 0.2s; | |
} | |
.course-link:hover { | |
background-color: #0056b3; | |
} | |
.no-results { | |
text-align: center; | |
color: #666; | |
font-style: italic; | |
} | |
""" | |
# Gradio interface | |
iface = gr.Interface( | |
fn=gradio_search, | |
inputs=gr.Textbox(label="Enter your search query", placeholder="e.g., machine learning, data science, python"), | |
outputs=gr.HTML(label="Search Results"), | |
title="Analytics Vidhya Smart Search Tool🔍🌐", | |
description="Find the most relevant courses from Analytics Vidhya Website based on your query.", | |
theme="huggingface", | |
css=custom_css, | |
examples=[ | |
["Tableau Course"], | |
["Machine Learning/Deep Learning with Python"], | |
["Business Analytics"] | |
], | |
) | |
if __name__ == "__main__": | |
iface.launch(debug=True) |