File size: 5,583 Bytes
ab6f2e2
 
 
4c20389
70b0e3f
ab6f2e2
 
 
 
 
 
 
 
b8ff5f2
ab6f2e2
 
 
 
 
 
e2d23a5
ab6f2e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7c4307
7e5c13e
 
 
b7c4307
ab6f2e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2d23a5
ab6f2e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# import packages and data
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from ast import literal_eval
import streamlit as st
# Load the review data set; the CSV is expected next to this script.
df = pd.read_csv("modified_hotel.csv")

st.title('Hotel Recommendation by Text :compass:')
st.image('https://www.peninsula.com/en/-/media/news-room/about-us/hkcompany-profile.png?mw=905&hash=3A250365D26358DC9362972842F9321A')

# Show a random sample instead of the full frame.
st.write("#### Data Frame")
st.write("I dropped 30 percent of the original data to fit the file limit, we have 361K rows each accounting and individual comment. with no empty values. Review Dates range from 2015 to 2017.")
st.dataframe(df.sample(1000))

# Distributions of the two columns that the cleaning step filters on.
st.write("The negativity of all the reviews in ascending order. I will drop reviews above 20 negativity.")
fig = px.histogram(df, x='Review_Total_Negative_Word_Counts')
st.plotly_chart(fig)
st.write("The reviewer scores in ascending order. I will drop scores below 8.")
fig = px.histogram(df, x='Reviewer_Score')
st.plotly_chart(fig)

# Hotel locations, first as a plain scatter plot, then on a map.
st.write("The hotels are from capital cities of 6 countries: Netherlands, United Kingdom, France, Spain, Italy, Austria ")
st.write("#### Scatter Plot of Latitude vs Longitude")
fig = px.scatter(df, x="lng", y="lat")
st.plotly_chart(fig)
st.write("#### Western Europe Map")
st.image('https://i.pinimg.com/736x/7b/98/94/7b9894e2f3059e6b1e2fa808516e57b7.jpg')

# Rows without coordinates are removed from df itself. The previous in-place
# dropna on an alias of df had the same effect, so later steps see the same rows.
df = df.dropna(subset=['lng', 'lat'])
# The frame handed to st.map uses the column name 'lon' (presumably what
# st.map expects - confirm against the installed Streamlit version). The
# rename is done on a copy: previously `mapdf = df` was only an alias, so the
# in-place rename also renamed the column in df and the later
# `df.drop(columns=[..., 'lng'])` raised KeyError.
mapdf = df.rename(columns={'lng': 'lon'})
st.map(mapdf)


st.write("#### Cleaned Data Frame")
st.write("We have 190K of good reviews now, with tags and country info ready to process.")
# Keep only favourable reviews: at most 20 negative words and a reviewer
# score of at least 8.
is_good_review = (df['Review_Total_Negative_Word_Counts'] <= 20) & (df['Reviewer_Score'] >= 8)
df = df[is_good_review].reset_index(drop=True)
# The country is identified by the last space-separated word of the address
# (e.g. "... United Kingdom" yields "Kingdom").
df["Country"] = df["Hotel_Address"].apply(lambda address: address.split(' ')[-1])
# The longitude column is called 'lng' in the CSV but may already have been
# renamed to 'lon' for the map above. Drop whichever name is present instead
# of hard-coding 'lng', which raised KeyError after the rename.
longitude_column = 'lng' if 'lng' in df.columns else 'lon'
# Drop the columns the recommender does not use.
df.drop(columns=['Additional_Number_of_Scoring',
                 'Review_Date',
                 'Reviewer_Nationality',
                 'Negative_Review',
                 'Review_Total_Negative_Word_Counts',
                 'Total_Number_of_Reviews',
                 'Positive_Review',
                 'Review_Total_Positive_Word_Counts',
                 'Total_Number_of_Reviews_Reviewer_Has_Given',
                 'Reviewer_Score',
                 'days_since_review',
                 'lat',
                 longitude_column], inplace=True)
def impute(column):
    """Return the tags of one row as a single concatenated string.

    Args:
        column: The row handed over by ``DataFrame.apply(..., axis=1)`` on a
            one-column frame, or any sequence whose first item is the tag
            value. The tag value is either a list of strings or the text of
            a Python list literal such as ``"[' Couple ', ' Stayed 2 nights ']"``.

    Returns:
        The individual tags joined without a separator.

    Raises:
        ValueError, SyntaxError: If a textual tag value is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    # Take the first item by position. On a label-indexed Series, plain
    # ``[0]`` is a deprecated positional lookup, so prefer ``.iloc``.
    tags = column.iloc[0] if hasattr(column, "iloc") else column[0]
    if not isinstance(tags, list):
        # literal_eval only parses literals; it does not execute code.
        tags = literal_eval(tags)
    # Join in both cases so the result is always a string. Returning a list
    # unchanged would break the ``.str.lower()`` / tokenizing steps later on.
    return "".join(tags)
# Display names for the selector, keyed by the lower-cased last word of the
# hotel address.
country_names = dict(
    kingdom='London - UK',
    france='Paris - France',
    netherlands='Amsterdam - Netherlands',
    spain='Madrid - Spain',
    italy='Rome - Italy',
    austria='Vienna - Austria',
)

# Flatten every row's "Tags" value with impute, then lower-case the result.
df["Tags"] = df[["Tags"]].apply(impute, axis=1)
df["Tags"] = df["Tags"].str.lower()

# Lower-case the country keyword and translate it to its display name.
df["Country"] = df["Country"].str.lower().map(country_names)

# Preview a random sample of the cleaned frame.
st.dataframe(df.sample(100))


#FUNCTION
def recommend_hotel(location, description):
    """Recommend up to ten hotels in ``location`` matching ``description``.

    A review's similarity is the number of distinct lemmatized, non-stop-word
    tokens that its ``Tags`` text shares with the description. Each hotel is
    represented by its best-matching review.

    Args:
        location: Value of the ``Country`` column to search in; compared
            case-insensitively.
        description: Free-text description of the desired stay.

    Returns:
        A DataFrame with the columns ``Hotel_Name``, ``Average_Score`` and
        ``Hotel_Address``, at most 10 rows, ordered from best to worst match
        (ties broken by higher ``Average_Score``) and indexed from 1.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def keywords(text):
        """Return the set of lemmatized, non-stop-word tokens in ``text``."""
        return {
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in stop_words
        }

    wanted = keywords(description.lower())

    # Work on a copy of the rows for the chosen location so the module-level
    # frame is never modified.
    candidates = df[df['Country'].str.lower() == location.lower()].copy()
    candidates['similarity'] = [
        len(keywords(tags) & wanted) for tags in candidates['Tags']
    ]

    # Keep one row per hotel: the review with the highest similarity.
    candidates.sort_values(by=['Hotel_Name', 'similarity'], ascending=[True, False], inplace=True)
    candidates.drop_duplicates(subset='Hotel_Name', keep='first', inplace=True)

    # The de-duplication sort leaves the rows in alphabetical hotel order.
    # Re-sort by match quality so that head(10) really is the top ten and not
    # just the first ten names.
    candidates.sort_values(by=['similarity', 'Average_Score'], ascending=[False, False], inplace=True)
    candidates.reset_index(drop=True, inplace=True)
    candidates.index += 1  # present the ranking starting at 1

    return candidates[['Hotel_Name', 'Average_Score', 'Hotel_Address']].head(10)

# Recommendation form: intro text, inputs, and the result table.
st.write("I have defined my function. Lets use it!")
st.write("## Europe Hotel Recommendation")
st.write("Pick a city you are plaining to go and describe what you want. It works like magic! :magic_wand:")

# The selectable locations are the distinct values of the Country column.
available_locations = df["Country"].unique()
location = st.selectbox("Select Location", available_locations)

# Free-text wish, pre-filled with an example.
description = st.text_input(
    "Enter Description",
    "A standard double room for 5 nights for a leisure trip.",
)

# The recommendation is only computed when the button is pressed.
search_requested = st.button("Find Best Hotels")
if search_requested:
    recommended_hotels = recommend_hotel(location, description)
    st.write("### Top 10 Recommended Hotels")
    st.write(recommended_hotels)
    st.write("HAVE A GOOD TRIP :wave:")