ibrahimnomad committed on
Commit
ab6f2e2
·
verified ·
1 Parent(s): b3b5d10

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. app.py +136 -0
  3. modified_hotel.csv +3 -0
  4. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ modified_hotel.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import packages and data
2
+ import nltk
3
# Fetch every NLTK resource this script uses, not just WordNet:
#   - 'wordnet'   -> WordNetLemmatizer
#   - 'stopwords' -> stopwords.words('english')
#   - 'punkt' / 'punkt_tab' -> word_tokenize (the required name depends on
#     the installed NLTK release, so both are requested)
# Without the tokenizer and stopword data the first recommendation request
# fails with a LookupError on a fresh environment.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
4
+ import numpy as np
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ from nltk.corpus import stopwords
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.stem.wordnet import WordNetLemmatizer
10
+ from ast import literal_eval
11
+ import streamlit as st
12
+
13
@st.cache_data
def load_reviews(path="modified_hotel.csv"):
    """Read the hotel review CSV into a DataFrame.

    Streamlit re-executes the whole script on every widget interaction, so
    the parse is wrapped in ``st.cache_data``: the file is read once and
    later reruns receive a fresh copy of the cached frame, which keeps the
    in-place edits made further down in the script safe.

    Args:
        path: Location of the CSV file to load.

    Returns:
        A pandas DataFrame with the contents of ``path``.
    """
    return pd.read_csv(path)


df = load_reviews()
14
+
15
# --- Page header -----------------------------------------------------------
st.title('Hotel Recommendation by Text :compass:')
st.image('https://www.peninsula.com/en/-/media/news-room/about-us/hkcompany-profile.png?mw=905&hash=3A250365D26358DC9362972842F9321A')

# --- Raw data preview ------------------------------------------------------
st.write("#### Data Frame")
st.write("I dropped 30 percent of the original data to fit the file limit, we have 361K rows each accounting and individual comment. with no empty values. Review Dates range from 2015 to 2017.")
st.dataframe(df)

# --- Distributions that motivate the filtering thresholds used later -------
st.write("The negativity of all the reviews in ascending order. I will drop reviews above 20 negativity.")
st.plotly_chart(px.histogram(df, x='Review_Total_Negative_Word_Counts'))
st.write("The reviewer scores in ascending order. I will drop scores below 8.")
st.plotly_chart(px.histogram(df, x='Reviewer_Score'))

# --- Geography: hotel coordinates next to a reference map ------------------
st.write("The hotels are from capital cities of 6 countries: Netherlands, United Kingdom, France, Spain, Italy, Austria ")
st.write("#### Scatter Plot of Latitude vs Longitude")
st.plotly_chart(px.scatter(df, x="lng", y="lat"))
st.write("#### Western Europe Map")
st.image('https://i.pinimg.com/736x/7b/98/94/7b9894e2f3059e6b1e2fa808516e57b7.jpg')
35
+
36
+
37
st.write("#### Cleaned Data Frame")
st.write("We have 190K of good reviews now, with tags and country info ready to process.")

# Keep only favourable reviews: at most 20 negative words and a score of 8+.
keep_review = (df['Reviewer_Score'] >= 8) & (df['Review_Total_Negative_Word_Counts'] <= 20)
df = df[keep_review].reset_index(drop=True)

# The last space-separated word of the address is used as the country marker.
df["Country"] = df["Hotel_Address"].map(lambda address: address.split(' ')[-1])

# Columns that play no part in the recommendation step.
unused_columns = [
    'Additional_Number_of_Scoring',
    'Review_Date',
    'Reviewer_Nationality',
    'Negative_Review',
    'Review_Total_Negative_Word_Counts',
    'Total_Number_of_Reviews',
    'Positive_Review',
    'Review_Total_Positive_Word_Counts',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Reviewer_Score',
    'days_since_review',
    'lat',
    'lng',
]
df = df.drop(columns=unused_columns)
57
+ # convert the strings of list into a normal list
58
def impute(column):
    """Collapse the single cell of a one-column row into a plain string.

    Intended for row-wise use via ``DataFrame.apply(impute, axis=1)`` on a
    one-column frame, where each call receives a one-element row.

    Args:
        column: One-element row (a pandas Series or any indexable sequence).
            Its only cell is either a real ``list`` or a string holding a
            Python list literal (presumably of tag strings; verify against
            the CSV).

    Returns:
        The cell itself when it already is a list; otherwise the items of
        the parsed list literal concatenated with no separator.

    Raises:
        ValueError, SyntaxError: If the cell is not a valid Python literal.
    """
    # Select the cell by position explicitly: integer keys on a label-indexed
    # Series only work through a deprecated positional fallback in pandas.
    cell = column.iloc[0] if hasattr(column, "iloc") else column[0]
    if isinstance(cell, list):
        return cell
    # literal_eval only evaluates Python literals, never arbitrary code.
    return "".join(literal_eval(cell))
64
# Display labels ("City - Country") keyed by the lowercased last word of the
# hotel address.
country_names = {
    'kingdom': 'London - UK',
    'france': 'Paris - France',
    'netherlands': 'Amsterdam - Netherlands',
    'spain': 'Madrid - Spain',
    'italy': 'Rome - Italy',
    'austria': 'Vienna - Austria',
}

# Flatten each stringified tag list into one lowercase string.
flattened_tags = df[["Tags"]].apply(impute, axis=1)
df["Tags"] = flattened_tags.str.lower()

# Lowercase the country marker, then swap it for its display label.
df['Country'] = df['Country'].str.lower().map(country_names)

st.dataframe(df)
79
+
80
+
81
+ #FUNCTION
82
def recommend_hotel(location, description):
    """Return up to ten hotels in ``location`` that best match ``description``.

    The description and every review's ``Tags`` string are tokenized,
    stripped of English stopwords and lemmatized. A review's similarity is
    the number of distinct lemmas it shares with the description. Each hotel
    is represented by its best-matching review, and hotels are ranked by that
    similarity, with ``Average_Score`` as the tie-breaker.

    Args:
        location: Value of the ``Country`` column to search in; compared
            case-insensitively.
        description: Free-text description of the desired stay.

    Returns:
        A DataFrame with columns ``Hotel_Name``, ``Average_Score`` and
        ``Hotel_Address``, at most ten rows, indexed from 1 in rank order.
        Empty when no hotel matches ``location``.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _keywords(text):
        # Distinct lemmas of the non-stopword tokens in ``text``.
        return {
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in stop_words
        }

    wanted = _keywords(description.lower())

    # Work on a copy so the module-level frame is never mutated.
    country = df[df['Country'].str.lower() == location.lower()].copy()

    # Similarity = number of lemmas shared between the tags and the request.
    country['similarity'] = [
        len(_keywords(tags) & wanted) for tags in country['Tags']
    ]

    # Rank by similarity first. Sorting by hotel name first (as before) made
    # the final head(10) return the alphabetically first hotels regardless
    # of how well they matched.
    country = country.sort_values(
        by=['similarity', 'Average_Score'], ascending=[False, False]
    )
    # After the sort, the first row per hotel is its best-matching review.
    country = country.drop_duplicates(subset='Hotel_Name', keep='first')
    country = country.reset_index(drop=True)
    country.index += 1  # 1-based rank for display

    return country[['Hotel_Name', 'Average_Score', 'Hotel_Address']].head(10)
120
+
121
st.write('I have defined my function. Lets use it!')
st.write('## Europe Hotel Recommendation')
st.write('Pick a city you are plaining to go and describe what you want. It works like magic! :magic_wand:')

# Inputs: destination picker and free-text wish list.
location = st.selectbox(label='Select Location', options=df['Country'].unique())
description = st.text_input(
    label='Enter Description',
    value="A standard double room for 5 nights for a leisure trip.",
)

# Recommendations are computed only when the button is pressed.
search_requested = st.button('Find Best Hotels')
if search_requested:
    recommended_hotels = recommend_hotel(location, description)
    st.write('### Top 10 Recommended Hotels')
    st.write(recommended_hotels)
    st.write('HAVE A GOOD TRIP :wave:')
134
+
135
+
136
+
modified_hotel.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cbce3b7daea0d727fb5fd4840a122dadcf7ae701e0b749dba04592b2164423
3
+ size 166615177
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ nltk
2
+ numpy
3
+ pandas
4
+ plotly
5
+ wordnet