Spaces:

ibrahimnomad
/

Europe_Hotel_Recommendation

Sleeping

App Files Files Community

Europe_Hotel_Recommendation / app.py

ibrahimnomad

Update app.py

7e5c13e verified 9 months ago

raw

history blame

5.58 kB

	# import packages and data
	import nltk
	nltk.download('wordnet')
	nltk.download('punkt')
	nltk.download('stopwords')
	import numpy as np
	import pandas as pd
	import plotly.express as px
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.stem.wordnet import WordNetLemmatizer
	from ast import literal_eval
	import streamlit as st
	# Load the review data set that ships next to the app.
	df = pd.read_csv("modified_hotel.csv")

	# Page header: title followed by a banner image.
	st.title('Hotel Recommendation by Text :compass:')
	st.image('https://www.peninsula.com/en/-/media/news-room/about-us/hkcompany-profile.png?mw=905&hash=3A250365D26358DC9362972842F9321A')

	# Describe the raw data and show a random 1000-row preview of it.
	st.write("#### Data Frame")
	st.write("I dropped 30 percent of the original data to fit the file limit, we have 361K rows each accounting and individual comment. with no empty values. Review Dates range from 2015 to 2017.")
	preview_rows = df.sample(n=1000)
	st.dataframe(preview_rows)

	# Distribution plots backing the cleaning thresholds announced in the captions:
	# each entry pairs the explanatory caption with the column to histogram.
	for caption, column_name in (
	    ("The negativity of all the reviews in ascending order. I will drop reviews above 20 negativity.",
	     'Review_Total_Negative_Word_Counts'),
	    ("The reviewer scores in ascending order. I will drop scores below 8.",
	     'Reviewer_Score'),
	):
	    st.write(caption)
	    fig = px.histogram(df, x=column_name)
	    st.plotly_chart(fig)

	# Geographic overview: raw coordinate scatter plus a static reference map image.
	st.write("The hotels are from capital cities of 6 countries: Netherlands, United Kingdom, France, Spain, Italy, Austria ")
	st.write("#### Scatter Plot of Latitude vs Longitude")
	fig = px.scatter(df, y="lat", x="lng")
	st.plotly_chart(fig)
	st.write("#### Western Europe Map")
	st.image('https://i.pinimg.com/736x/7b/98/94/7b9894e2f3059e6b1e2fa808516e57b7.jpg')

	# Interactive map. The longitude column is renamed to 'lon' before the frame
	# is handed to st.map.
	# NOTE: mapdf is only another name for df, so both the rename and the removal
	# of rows without coordinates also apply to df for the rest of the script.
	df.rename(columns={'lng': 'lon'}, inplace=True)
	df.dropna(subset=['lat', 'lon'], inplace=True)
	mapdf = df
	st.map(mapdf)


	st.write("#### Cleaned Data Frame")
	st.write("We have 190K of good reviews now, with tags and country info ready to process.")
	# Keep only favourable reviews: at most 20 negative words and a reviewer score of 8 or more.
	df = df[(df['Review_Total_Negative_Word_Counts'] <= 20) & (df['Reviewer_Score'] >= 8)].reset_index(drop=True)
	# The last word of the address identifies the country. Splitting on any
	# whitespace (instead of a single ' ') keeps a trailing space in an address
	# from producing an empty country.
	df["Country"] = df["Hotel_Address"].str.split().str[-1]
	# Drop the columns the recommender does not use. The longitude column was
	# renamed from 'lng' to 'lon' earlier in this script (for st.map), so it has
	# to be dropped under its new name: asking drop() for the no-longer-existing
	# 'lng' label raises KeyError.
	df.drop(
	    columns=[
	        'Additional_Number_of_Scoring',
	        'Review_Date',
	        'Reviewer_Nationality',
	        'Negative_Review',
	        'Review_Total_Negative_Word_Counts',
	        'Total_Number_of_Reviews',
	        'Positive_Review',
	        'Review_Total_Positive_Word_Counts',
	        'Total_Number_of_Reviews_Reviewer_Has_Given',
	        'Reviewer_Score',
	        'days_since_review',
	        'lat',
	        'lon',
	    ],
	    inplace=True,
	)
	# Turn a cell holding a Python list literal written as text into one plain string
	def impute(column):
	    """Return the tags of one row as a single string.

	    Args:
	        column: One-element row as passed by ``DataFrame.apply(..., axis=1)``
	            on a single-column frame; any sequence whose first item is the
	            cell value is accepted as well.

	    Returns:
	        The items of the parsed list literal concatenated into one string,
	        or the cell value itself when it already is a list.

	    Raises:
	        ValueError, SyntaxError: Propagated from ``literal_eval`` when the
	            cell text is not a valid Python literal.
	    """
	    # Read the cell by position. On a label-indexed row, ``column[0]`` only
	    # works through a deprecated integer-as-position fallback in pandas.
	    cell = column.iloc[0] if hasattr(column, "iloc") else column[0]
	    if isinstance(cell, list):
	        return cell
	    # Items are concatenated without a separator, so word boundaries in the
	    # result depend on the whitespace already present inside each item.
	    return "".join(literal_eval(cell))
	# Flatten every row's "Tags" cell with impute (called row-wise on a one-column frame)
	df["Tags"] = df[["Tags"]].apply(impute, axis=1)

	# Lowercase both text columns so later comparisons are case-insensitive
	for text_column in ('Country', 'Tags'):
	    df[text_column] = df[text_column].str.lower()

	# Replace the country keyword taken from the address with a "City - Country" label.
	# Values missing from this table become NaN after map().
	country_names = dict(
	    kingdom='London - UK',
	    france='Paris - France',
	    netherlands='Amsterdam - Netherlands',
	    spain='Madrid - Spain',
	    italy='Rome - Italy',
	    austria='Vienna - Austria',
	)
	df['Country'] = df['Country'].map(country_names)

	# Show a random 100-row sample of the cleaned frame
	st.dataframe(df.sample(n=100))


	#FUNCTION
	def recommend_hotel(location, description):
	    """Recommend up to ten hotels in ``location`` matching ``description``.

	    Each review row of the module-level ``df`` whose ``Country`` equals
	    ``location`` (case-insensitively) is scored by the number of distinct
	    keywords its ``Tags`` text shares with ``description``. Keywords are the
	    lemmatized tokens that are not English stop words. Every hotel is then
	    represented by its best-scoring row.

	    Args:
	        location: Value of the ``Country`` column to search in.
	        description: Free-text description of the desired stay.

	    Returns:
	        A DataFrame with the columns ``Hotel_Name``, ``Average_Score`` and
	        ``Hotel_Address``, at most ten rows, best match first, indexed from 1.
	    """
	    stop_words = set(stopwords.words('english'))
	    lemmatizer = WordNetLemmatizer()

	    def keywords(text):
	        # Distinct lemmatized tokens of ``text`` with stop words removed.
	        return {
	            lemmatizer.lemmatize(token)
	            for token in word_tokenize(text)
	            if token not in stop_words
	        }

	    wanted = keywords(description.lower())

	    # Restrict to the requested location; copy so the shared frame is untouched
	    country = df[df['Country'].str.lower() == location.lower()].copy()

	    # Score = size of the keyword overlap (a plain count, not a cosine similarity)
	    country['similarity'] = [len(keywords(tags) & wanted) for tags in country['Tags']]

	    # Rank best matches first; ties go to the higher average score, then to the
	    # name for a stable order. Sorting by hotel name first would make the
	    # final head(10) return the alphabetically first hotels instead of the
	    # best-matching ones.
	    country.sort_values(
	        by=['similarity', 'Average_Score', 'Hotel_Name'],
	        ascending=[False, False, True],
	        inplace=True,
	    )
	    # The first row seen for each hotel is now its highest-scoring one
	    country.drop_duplicates(subset='Hotel_Name', keep='first', inplace=True)
	    country.reset_index(drop=True, inplace=True)
	    country.index += 1  # present ranks starting at 1

	    # Select relevant columns and return top recommendations
	    return country[['Hotel_Name', 'Average_Score', 'Hotel_Address']].head(10)

	# Introductory text for the interactive part of the page
	st.write('I have defined my function. Lets use it!')
	st.write('## Europe Hotel Recommendation')
	st.write('Pick a city you are plaining to go and describe what you want. It works like magic! :magic_wand:')

	# Inputs: a location picked from the labels present in the data, plus free text
	location_options = df['Country'].unique()
	location = st.selectbox('Select Location', options=location_options)
	description = st.text_input('Enter Description', value="A standard double room for 5 nights for a leisure trip.")

	# The recommendation is computed only when the button is pressed
	find_clicked = st.button('Find Best Hotels')
	if find_clicked:
	    top_hotels = recommend_hotel(location, description)
	    st.write('### Top 10 Recommended Hotels')
	    st.write(top_hotels)
	    # NOTE(review): assumed to belong to the button branch; confirm against the original layout
	    st.write('HAVE A GOOD TRIP :wave:')