|
|
|
import nltk |
|
# Fetch the NLTK data this script relies on:
#   * 'wordnet'   - dictionary behind WordNetLemmatizer
#   * 'punkt'     - tokenizer models read by older NLTK releases
#   * 'punkt_tab' - tokenizer tables that word_tokenize loads in NLTK >= 3.8.2;
#                   without it tokenisation fails with LookupError there
#   * 'stopwords' - the English stop-word list
for _resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem.wordnet import WordNetLemmatizer |
|
from ast import literal_eval |
|
import streamlit as st |
|
# Load the review data that every later section of the page works on.
df = pd.read_csv("modified_hotel.csv")

# ---- Page header -----------------------------------------------------------
st.title('Hotel Recommendation by Text :compass:')
st.image('https://www.peninsula.com/en/-/media/news-room/about-us/hkcompany-profile.png?mw=905&hash=3A250365D26358DC9362972842F9321A')

# ---- Raw data preview ------------------------------------------------------
st.write("#### Data Frame")
st.write("I dropped 30 percent of the original data to fit the file limit, we have 361K rows each accounting and individual comment. with no empty values. Review Dates range from 2015 to 2017.")
# Show a random 1000-row sample rather than the whole frame.
st.dataframe(df.sample(1000))

# ---- Distributions that motivate the filtering thresholds applied later ----
st.write("The negativity of all the reviews in ascending order. I will drop reviews above 20 negativity.")
st.plotly_chart(px.histogram(df, x='Review_Total_Negative_Word_Counts'))

st.write("The reviewer scores in ascending order. I will drop scores below 8.")
st.plotly_chart(px.histogram(df, x='Reviewer_Score'))

# ---- Geography -------------------------------------------------------------
st.write("The hotels are from capital cities of 6 countries: Netherlands, United Kingdom, France, Spain, Italy, Austria ")
st.write("#### Scatter Plot of Latitude vs Longitude")
st.plotly_chart(px.scatter(df, x="lng", y="lat"))

st.write("#### Western Europe Map")
st.image('https://i.pinimg.com/736x/7b/98/94/7b9894e2f3059e6b1e2fa808516e57b7.jpg')
|
|
|
# Reviews without coordinates cannot be plotted. They are removed from `df`
# itself, so the rest of the script keeps working on the same rows as before.
df.dropna(subset=['lng', 'lat'], inplace=True)

# st.map does not recognise a column called 'lng', so the map gets its own
# small frame with the longitude column renamed to 'lon'. `df` must keep its
# original column names: the column clean-up further down drops 'lng' by name
# and would raise KeyError if the rename were applied to `df` in place.
mapdf = df[['lat', 'lng']].rename(columns={'lng': 'lon'})

st.map(mapdf)
|
|
|
|
|
st.write("#### Cleaned Data Frame")
st.write("We have 190K of good reviews now, with tags and country info ready to process.")

# Keep only reviews with at most 20 negative words and a reviewer score of 8+.
df = df[(df['Review_Total_Negative_Word_Counts'] <= 20) & (df['Reviewer_Score'] >= 8)].reset_index(drop=True)

# The last space-separated word of the address is used as the country key
# (e.g. "... United Kingdom" -> "Kingdom").
df["Country"] = df["Hotel_Address"].apply(lambda x: x.split(' ')[-1])

# Columns that the recommender does not need.
unused_columns = [
    'Additional_Number_of_Scoring',
    'Review_Date',
    'Reviewer_Nationality',
    'Negative_Review',
    'Review_Total_Negative_Word_Counts',
    'Total_Number_of_Reviews',
    'Positive_Review',
    'Review_Total_Positive_Word_Counts',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Reviewer_Score',
    'days_since_review',
    'lat',
]
# The longitude column is called 'lng' in the CSV but 'lon' once it has been
# renamed for st.map. Asking drop() for a label that is absent raises
# KeyError, so drop whichever spelling is actually present.
unused_columns += [name for name in ('lng', 'lon') if name in df.columns]

df.drop(columns=unused_columns, inplace=True)
|
|
|
def impute(column):
    """Flatten the stringified tag list held in a one-cell row.

    Parameters
    ----------
    column
        A one-element row (the pandas Series handed over by
        ``DataFrame.apply(..., axis=1)``) or any plain sequence. Only the
        first element is read.

    Returns
    -------
    str or list
        If the cell is text such as ``"[' Leisure trip ', ' Couple ']"`` it is
        parsed with ``literal_eval`` and the tags are returned as one
        space-separated string (``"Leisure trip Couple"``). A cell that is
        already a ``list`` is returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when the cell is not a valid literal.
    """
    # Read the first cell by position. On a label-indexed Series, `row[0]`
    # relies on a deprecated integer fallback, so use .iloc when it exists.
    value = column.iloc[0] if hasattr(column, "iloc") else column[0]

    if isinstance(value, list):
        return value

    # Strip each tag and join with a single space so that the last word of one
    # tag can never fuse with the first word of the next.
    return " ".join(tag.strip() for tag in literal_eval(value))
|
|
|
# Human-readable "City - Country" label for each lower-cased country key
# (the key is the last word of the hotel address).
country_names = dict(
    kingdom='London - UK',
    france='Paris - France',
    netherlands='Amsterdam - Netherlands',
    spain='Madrid - Spain',
    italy='Rome - Italy',
    austria='Vienna - Austria',
)

# Run every row's Tags cell through impute(), then lower-case the result.
df["Tags"] = df[["Tags"]].apply(impute, axis=1)
df['Tags'] = df['Tags'].str.lower()

# Lower-case the country key and swap it for its display label; keys missing
# from the lookup table become NaN.
df['Country'] = df['Country'].str.lower().map(country_names)

# Preview a random 100-row sample of the cleaned frame.
st.dataframe(df.sample(100))
|
|
|
|
|
|
|
def recommend_hotel(location, description):
    """Return up to ten hotels in ``location`` whose tags best match ``description``.

    The free-text description and every review's ``Tags`` string are reduced
    to a set of lemmatised, non-stop-word tokens. A review's similarity is the
    number of tokens it shares with the description. Each hotel is represented
    by its best-matching review.

    Parameters
    ----------
    location : str
        Value of the ``Country`` column to search in (compared
        case-insensitively), e.g. ``'Paris - France'``.
    description : str
        Free-text description of the desired stay.

    Returns
    -------
    pandas.DataFrame
        Columns ``Hotel_Name``, ``Average_Score`` and ``Hotel_Address``, at
        most ten rows, indexed from 1. Rows are ordered by best similarity
        first, ties broken by higher ``Average_Score`` and then by hotel name.

    Notes
    -----
    Reads the module-level ``df``, which must provide the columns ``Country``,
    ``Tags``, ``Hotel_Name``, ``Average_Score`` and ``Hotel_Address``.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def keywords(text):
        """Tokenise ``text``, drop stop words, and return the set of lemmas."""
        return {
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in stop_words
        }

    wanted = keywords(description.lower())

    # Work on a copy so the shared frame never gains a 'similarity' column.
    country = df[df['Country'].str.lower() == location.lower()].copy()

    # Many reviews carry the exact same tag string, so score each distinct
    # string once instead of tokenising every row.
    overlap_by_tags = {
        tags: len(keywords(tags) & wanted)
        for tags in country['Tags'].unique()
    }
    country['similarity'] = country['Tags'].map(overlap_by_tags)

    # Rank by similarity. Ordering by hotel name first would make head(10)
    # return the alphabetically first hotels instead of the best matches.
    # Average score breaks the frequent ties between small overlap counts,
    # and the hotel name keeps the order deterministic.
    country.sort_values(
        by=['similarity', 'Average_Score', 'Hotel_Name'],
        ascending=[False, False, True],
        inplace=True,
    )

    # After the sort, the first row of each hotel is its best-matching review.
    country.drop_duplicates(subset='Hotel_Name', keep='first', inplace=True)

    # Present the ranking as 1, 2, 3, ...
    country.reset_index(drop=True, inplace=True)
    country.index += 1

    return country[['Hotel_Name', 'Average_Score', 'Hotel_Address']].head(10)
|
|
|
# ---- Recommendation form ---------------------------------------------------
st.write('I have defined my function. Lets use it!')
st.write('## Europe Hotel Recommendation')
st.write('Pick a city you are plaining to go and describe what you want. It works like magic! :magic_wand:')

# Offer every distinct value of the Country column as a destination.
location = st.selectbox(label='Select Location', options=df['Country'].unique())

# Free-text wish list, pre-filled with an example.
description = st.text_input(
    label='Enter Description',
    value="A standard double room for 5 nights for a leisure trip.",
)

# Compute the recommendation first, then render heading, table and farewell.
search_requested = st.button('Find Best Hotels')
if search_requested:
    top_hotels = recommend_hotel(location, description)
    st.write('### Top 10 Recommended Hotels')
    st.write(top_hotels)
    st.write('HAVE A GOOD TRIP :wave:')
|
|
|
|
|
|
|
|