hsaest commited on
Commit
9be4956
·
verified ·
1 Parent(s): 9179e9f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. __pycache__/content.cpython-39.pyc +0 -0
  3. annotation/.DS_Store +0 -0
  4. annotation/src/__pycache__/utils.cpython-39.pyc +0 -0
  5. annotation/src/utils.py +186 -0
  6. app.py +197 -0
  7. content.py +70 -0
  8. database/.DS_Store +0 -0
  9. database/accommodations/.DS_Store +0 -0
  10. database/accommodations/clean_accommodations_2022.csv +0 -0
  11. database/attractions/attractions.csv +0 -0
  12. database/background/attractions.csv +0 -0
  13. database/background/citySet.txt +311 -0
  14. database/background/citySet_with_states.txt +312 -0
  15. database/background/clean_data.py +14 -0
  16. database/background/get_state_set.py +22 -0
  17. database/background/stateSet.txt +65 -0
  18. database/background/test.py +8 -0
  19. database/flights/.DS_Store +0 -0
  20. database/flights/clean_Flights_2022.csv +3 -0
  21. database/googleDistanceMatrix/clean_data.py +17 -0
  22. database/googleDistanceMatrix/distance.csv +0 -0
  23. database/googleDistanceMatrix/distance_org.csv +0 -0
  24. database/restaurants/.DS_Store +0 -0
  25. database/restaurants/clean_restaurant_2022.csv +0 -0
  26. evaluation/.DS_Store +0 -0
  27. evaluation/__pycache__/commonsenseConstraint.cpython-39.pyc +0 -0
  28. evaluation/__pycache__/eval.cpython-39.pyc +0 -0
  29. evaluation/__pycache__/hardConstraint.cpython-39.pyc +0 -0
  30. evaluation/commonsenseConstraint.py +735 -0
  31. evaluation/eval.py +181 -0
  32. evaluation/hardConstraint.py +266 -0
  33. evaluation/scored/1_validation_two-stage_1.jsonl +1 -0
  34. evaluation/scored/textbox_validation_two-stage_1.jsonl +1 -0
  35. requirements.txt +3 -0
  36. tools/__init__.py +0 -0
  37. tools/__pycache__/__init__.cpython-39.pyc +0 -0
  38. tools/accommodations/.ipynb_checkpoints/test-checkpoint.ipynb +0 -0
  39. tools/accommodations/__init__.py +0 -0
  40. tools/accommodations/__pycache__/__init__.cpython-39.pyc +0 -0
  41. tools/accommodations/__pycache__/apis.cpython-39.pyc +0 -0
  42. tools/accommodations/apis.py +91 -0
  43. tools/accommodations/test.ipynb +2037 -0
  44. tools/accommodations/test.py +12 -0
  45. tools/attractions/__pycache__/apis.cpython-39.pyc +0 -0
  46. tools/attractions/apis.py +34 -0
  47. tools/attractions/test.py +17 -0
  48. tools/cities/__pycache__/apis.cpython-39.pyc +0 -0
  49. tools/cities/apis.py +23 -0
  50. tools/cities/test.py +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ database/flights/clean_Flights_2022.csv filter=lfs diff=lfs merge=lfs -text
__pycache__/content.cpython-39.pyc ADDED
Binary file (4.84 kB). View file
 
annotation/.DS_Store ADDED
Binary file (8.2 kB). View file
 
annotation/src/__pycache__/utils.cpython-39.pyc ADDED
Binary file (6.95 kB). View file
 
annotation/src/utils.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import os
4
+ import gradio as gr
5
+
6
def load_line_json_data(filename):
    """Load a JSON-lines file into a list of decoded objects.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream line by line and skip blank lines: an empty file or a stray
        # empty line would otherwise reach json.loads('') and crash.
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data
13
+
14
def extract_query_number(query_string):
    """Pull the integer out of a label such as "Query 7" or "Query 7 --- Done".

    Args:
        query_string (str): Text to scan for the first "Query <digits>" run.

    Returns:
        int | None: The digits after "Query " as an int, or None when the
        text contains no such run.
    """
    found = re.search(r"Query (\d+)", query_string)
    if found is None:
        return None
    return int(found.group(1))
27
+
28
def create_data_display(css_content,data,annotation_idx):
    """Render one query record as an HTML snippet with an inline stylesheet.

    Args:
        css_content: CSS text placed verbatim inside the <style> element.
        data: Sequence of query records (mappings); the record shown is
            ``data[annotation_idx-1]``.
        annotation_idx: 1-based position of the record; also printed as the
            query number.

    Returns:
        str: The HTML fragment.
    """
    # Look the record up once instead of re-indexing for every field.
    entry = data[annotation_idx-1]
    constraints = entry['local_constraint']
    return f"""
<style>
{css_content}
</style>
<div>
<span class="query-highlighted"><strong>Query {annotation_idx}:</strong> {entry['query']}</span><br>
<span class="highlighted"><strong>Day:</strong> {entry['days']}</span>
<span class="highlighted"><strong>Visiting City Number:</strong> {entry['visiting_city_number']}</span>
<span class="highlighted"><strong>Date:</strong> {entry['date']}</span>
<span class="highlighted"><strong>Departure:</strong> {entry['org']}</span>
<span class="highlighted"><strong>Destination:</strong> {entry['dest']}</span><br>
<span class="highlighted-alt"><strong>People Number:</strong> {entry['people_number']}</span>
<span class="highlighted-alt"><strong>Budget:</strong> {entry['budget']}</span>
<span class="highlighted-alt"><strong>Hotel Rule:</strong> {constraints['house rule']}</span>
<span class="highlighted-alt"><strong>Cuisine:</strong> {constraints['cuisine']}</span>
<span class="highlighted-alt"><strong>Room Type:</strong> {constraints['room type']}</span>
<span class="highlighted-alt"><strong>Transportation:</strong> {constraints['transportation']}</span><br>
</div>
"""
48
+
49
def judge_valid_info(info):
    """Tell whether a form field holds real content.

    Args:
        info: The field value.

    Returns:
        bool: False for an empty/falsy value or for the "nothing to fill in"
        placeholder text, True otherwise.
    """
    placeholder = "You don't need to fill in the information for this or later days."
    return bool(info) and info != placeholder
53
+
54
def judge_submit_info(info, current_day, label, annotation_data, *tested_data):
    """Validate one submitted field, raising a UI error when it is unacceptable.

    Args:
        info: The submitted text; "-" means the field is intentionally unused.
        current_day: Day number, used only in the error messages.
        label: Field name; "transportation" and "accommodation" get extra checks.
        annotation_data: Query record whose ``local_constraint`` is enforced.
        *tested_data: For "accommodation", ``tested_data[0]`` is the table
            handed to the room-type and house-rule checks.

    Returns:
        bool: Always True when no error was raised.

    Raises:
        gr.Error: If the field is empty or violates a constraint.
    """
    if info == "" or not info:
        raise gr.Error("Day {} {} is empty!".format(current_day, label))

    # "-" is accepted for any field without further checks.
    if info == "-":
        return True

    if label == "transportation":
        transport_ok = judge_valid_transportation(info, annotation_data)
        if not transport_ok:
            raise gr.Error("Day {} {} is invalid! Please note the transportation.".format(current_day, label))
    elif label == "accommodation":
        accommodation_table = tested_data[0]
        room_type_ok = judge_valid_room_type(info, annotation_data, accommodation_table)
        if not room_type_ok:
            raise gr.Error("Day {} {} is invalid! Please note the room type.".format(current_day, label))

        house_rule_ok = judge_valid_room_rule(info, annotation_data, accommodation_table)
        if not house_rule_ok:
            raise gr.Error("Day {} {} is invalid! Please note the house rules.".format(current_day, label))

    return True
69
+
70
+
71
def judge_valid_transportation(info, annotation_data):
    """Check a transportation entry against the query's transportation constraint.

    Args:
        info: The transportation text entered for the day.
        annotation_data: Query record; reads
            ``annotation_data['local_constraint']['transportation']``.

    Returns:
        bool: False when the constraint is 'no flight' and the text contains
        'Flight', or is 'no self-driving' and the text contains
        'Self-driving'; True in every other case.
    """
    rule = annotation_data['local_constraint']['transportation']
    if rule == 'no flight':
        return 'Flight' not in info
    if rule == 'no self-driving':
        return 'Self-driving' not in info
    return True
77
+
78
def judge_valid_room_type(info, annotation_data, accommodation_data_all):
    """Check the chosen accommodation against the query's room-type constraint.

    Args:
        info: Accommodation text, resolved to table rows by get_filtered_data.
        annotation_data: Query record; reads
            ``annotation_data['local_constraint']['room type']``.
        accommodation_data_all: Table with a 'room type' column.

    Returns:
        bool: False when the first matching row's room type conflicts with
        the constraint, True otherwise (including for constraint values not
        handled here).
    """
    listing = get_filtered_data(info, accommodation_data_all)
    wanted = annotation_data['local_constraint']['room type']

    # The row is read only once a known constraint applies, so an
    # unconstrained query never touches a possibly empty match.
    if wanted == 'not shared room':
        if listing['room type'].values[0] == 'Shared room':
            return False
        return True

    # Constraints that demand one exact room-type label.
    exact_labels = (
        ('shared room', 'Shared room'),
        ('private room', 'Private room'),
        ('entire room', 'Entire home/apt'),
    )
    for constraint, required_label in exact_labels:
        if wanted == constraint:
            if listing['room type'].values[0] != required_label:
                return False
            return True

    return True
93
+
94
def judge_valid_room_rule(info, annotation_data, accommodation_data_all):
    """Check the chosen accommodation against the query's house-rule constraint.

    Args:
        info: Accommodation text, resolved to table rows by get_filtered_data.
        annotation_data: Query record; reads
            ``annotation_data['local_constraint']['house rule']``.
        accommodation_data_all: Table with a 'house_rules' column.

    Returns:
        bool: False when the first matching row's house rules contain the
        prohibition that conflicts with the constraint (e.g. constraint
        'smoking' vs. 'No smoking'), True otherwise.
    """
    accommodation_data_filtered = get_filtered_data(info, accommodation_data_all)
    house_rule = annotation_data['local_constraint']['house rule']

    # Constraint value -> phrase in the listing that rules the place out.
    # The original compared against the misspelling 'parities', so a
    # 'parties' constraint was never enforced; accept both spellings.
    forbidden_phrases = {
        'smoking': 'No smoking',
        'parties': 'No parties',
        'parities': 'No parties',
        'children under 10': 'No children under 10',
        'visitors': 'No visitors',
        'pets': 'No pets',
    }
    phrase = forbidden_phrases.get(house_rule) if isinstance(house_rule, str) else None
    if phrase is None:
        # No constraint (or one not handled here): do not touch the row.
        return True

    listing_rules = str(accommodation_data_filtered['house_rules'].values[0])
    return phrase not in listing_rules
108
+
109
def judge_valid_cuisine(info, annotation_data, restaurant_data_all, cuisine_set: set):
    """Record which requested cuisines the chosen restaurant serves.

    Args:
        info: Restaurant text; "-" means no meal was planned.
        annotation_data: Query record; reads
            ``annotation_data['local_constraint']['cuisine']`` and
            ``annotation_data['org']``.
        restaurant_data_all: Table with 'Name', 'City' and 'Cuisines' columns.
        cuisine_set: Accumulator of cuisines found so far; updated in place.

    Returns:
        set: The same ``cuisine_set`` object that was passed in.
    """
    if info == "-":
        return cuisine_set

    requested = annotation_data['local_constraint']['cuisine']
    # Entries whose text contains the origin city are skipped.
    if requested is None or annotation_data['org'] in info:
        return cuisine_set

    matches = get_filtered_data(info, restaurant_data_all,('Name','City'))
    for wanted in requested:
        if wanted in matches.iloc[0]['Cuisines']:
            cuisine_set.add(wanted)
    return cuisine_set
116
+
117
+
118
+
119
+
120
def get_valid_name_city(info):
    """Split a "Name, City" or "Name, City(State)" entry into name and city.

    The text is split at the last comma; any parenthesised suffix on the
    city part is dropped by extract_before_parenthesis.

    Args:
        info: The entry text.

    Returns:
        tuple: ``(name, city)``, both stripped. When the text cannot be
        parsed, a message is printed and ``("-", "-")`` is returned.
    """
    parsed = re.search(r'(.*?),\s*([^,]+)(\(\w[\w\s]*\))?$', info)
    if not parsed:
        print(f"{info} can not be parsed, '-' will be used instead.")
        return "-","-"

    name = parsed.group(1).strip()
    city_part = parsed.group(2).strip()
    return name, extract_before_parenthesis(city_part).strip()
129
+
130
+
131
def extract_numbers_from_filenames(directory):
    """Collect the numbers of all ``annotation_<N>.json`` files in a directory.

    Args:
        directory: Path of the directory to list.

    Returns:
        list[int]: One number per matching file, in ``os.listdir`` order.
    """
    # The dot is escaped and the whole name must match: the old
    # r'annotation_(\d+).json' with re.match also accepted names such as
    # 'annotation_5.json.bak' or 'annotation_7xjson'.
    pattern = re.compile(r'annotation_(\d+)\.json')

    numbers = []
    for file in os.listdir(directory):
        # Match each name once instead of re.match followed by re.search.
        match = pattern.fullmatch(file)
        if match:
            numbers.append(int(match.group(1)))
    return numbers
142
+
143
def get_city_list(days, deparure_city, destination):
    """Build the list of selectable cities for a trip.

    Args:
        days: Trip length. For exactly 3 days ``destination`` is used as a
            single city.
        deparure_city: Departure city; always the first list entry.
        destination: A city (3-day trips) or a state name that is looked up
            in ``../database/background/citySet_with_states.txt``.

    Returns:
        list[str]: ``[departure, destination]`` for 3-day trips; otherwise the
        departure city followed by every other city of the state, each
        written as ``"City(State)"``.

    Raises:
        KeyError: If ``destination`` is not a state present in the file.
        ValueError: If a non-blank line is not ``city<TAB>state``.
    """
    city_list = [deparure_city]
    if days == 3:
        city_list.append(destination)
        return city_list

    state_city_map = {}
    # Close the file deterministically (the original leaked the handle) and
    # ignore blank lines, which used to make the tuple-unpack below raise.
    with open('../database/background/citySet_with_states.txt', encoding='utf-8') as f:
        for unit in f.read().split('\n'):
            if not unit.strip():
                continue
            city, state = unit.split('\t')
            state_city_map.setdefault(state, []).append(city)

    for city in state_city_map[destination]:
        if city != deparure_city:
            city_list.append(city + f"({destination})")
    return city_list
160
+
161
def get_filtered_data(component,data, column_name=('NAME','city')):
    """Select the table rows matching a "Name, City" entry.

    Args:
        component: Entry text, parsed by get_valid_name_city.
        data: Table supporting boolean-mask indexing (used with pandas
            DataFrames elsewhere in this file).
        column_name: Pair ``(name_column, city_column)`` naming the columns
            compared against the parsed name and city.

    Returns:
        The rows of ``data`` where both columns equal the parsed values.
    """
    name, city = get_valid_name_city(component)
    name_column, city_column = column_name[0], column_name[1]
    same_name = data[name_column] == name
    same_city = data[city_column] == city
    return data[same_name & same_city]
164
+
165
def extract_before_parenthesis(s):
    """Return the text that precedes the first "(...)" group in *s*.

    Args:
        s: Input text, e.g. "Dallas(Texas)".

    Returns:
        str: Everything before the first parenthesised group, or *s*
        unchanged when it contains none.
    """
    # The pattern is anchored with ^, so match() and search() agree here.
    found = re.match(r'^(.*?)\([^)]*\)', s)
    if found:
        return found.group(1)
    return s
168
+
169
def count_consecutive_values(lst):
    """Run-length encode a sequence.

    Args:
        lst: Sequence of values compared with ``==``.

    Returns:
        list[tuple]: One ``(value, run_length)`` pair per run of equal
        adjacent values, in order; ``[]`` for an empty input.
    """
    if not lst:
        return []

    runs = []
    run_value, run_length = lst[0], 1
    for value in lst[1:]:
        if value == run_value:
            run_length += 1
            continue
        # A different value closes the current run and starts a new one.
        runs.append((run_value, run_length))
        run_value, run_length = value, 1

    runs.append((run_value, run_length))  # flush the final run
    return runs
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
4
+ sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
5
+ os.chdir(os.path.dirname(os.path.abspath(__file__)))
6
+ import json
7
+ import datetime
8
+ from email.utils import parseaddr
9
+
10
+ import gradio as gr
11
+ import pandas as pd
12
+ import numpy as np
13
+
14
+ from datasets import load_dataset
15
+ from apscheduler.schedulers.background import BackgroundScheduler
16
+ from huggingface_hub import HfApi
17
+
18
+ # InfoStrings
19
+ # from scorer import question_scorer
20
+ from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
21
+ from evaluation.eval import eval_score
22
+
23
+ TOKEN = os.environ.get("TOKEN", None)
24
+
25
+ OWNER="osunlp"
26
+ DATA_DATASET = f"{OWNER}/TravelBench"
27
+ EVAL_DATASET = f"{OWNER}/TravelBenchEval"
28
+
29
+ api = HfApi()
30
+
31
+ YEAR_VERSION = "2024"
32
+
33
+ os.makedirs("scored", exist_ok=True)
34
+
35
+ # # Display the results
36
+ eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
37
def get_dataframe_from_results(eval_results, split):
    """Turn one split of the scores dataset into a leaderboard table.

    Args:
        eval_results: Mapping from split name to a dataset object that offers
            ``remove_columns`` and ``column_names``.
        split: Name of the split to display.

    Returns:
        pandas.DataFrame: The split without its "Mail" column, sorted by
        "Final Pass Rate" in descending order, with every column whose name
        contains "Rate" multiplied by 100 and rounded to two decimals.
    """
    scores = eval_results[split].remove_columns(["Mail"])
    rate_columns = [name for name in scores.column_names if "Rate" in name]

    table = pd.DataFrame(scores).sort_values(by=["Final Pass Rate"], ascending=False)
    # Rates are scaled by 100 for display.
    table[rate_columns] = table[rate_columns].multiply(100).round(decimals=2)
    return table
45
+
46
+
47
+ eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
48
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
49
+
50
+
51
+
52
+ # def restart_space():
53
+ # api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
54
+
55
+
56
def load_line_json_data(filename):
    """Load a JSON-lines file into a list of decoded objects.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream line by line and skip blank lines: an empty file or a stray
        # empty line would otherwise reach json.loads('') and crash.
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data
63
+
64
+
65
def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    """Score a submitted plan file and append the result to the leaderboard.

    The raw file and the scored result are uploaded to EVAL_DATASET, a new
    row is added to the in-memory ``eval_results`` split and the whole
    dataset is pushed back to the hub.

    Args:
        val_or_test: Split the submission targets; used as the key into
            ``eval_results`` and passed to ``eval_score``.
        eval_mode: Evaluation mode label, used in the stored file names.
        model: Model name shown on the leaderboard.
        planning_strategy: Planning strategy shown on the leaderboard.
        organization: Submitting organization; also the upload folder.
        mail: Contact e-mail; must contain "@" after parsing.
        path_to_file: Uploaded file object whose ``.name`` is the local path
            (despite the ``str`` annotation), or None when nothing was attached.

    Returns:
        str: An HTML message: a warning for bad input, otherwise a
        confirmation.
    """
    # Very basic email parsing
    parsed_mail = parseaddr(mail)[1]
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    submitted_path = path_to_file.name

    # Keep a copy of the raw submission in the evaluation dataset repo.
    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=submitted_path,
        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Score the submission and write the result to a local one-line file.
    result = eval_score(val_or_test,file_path=submitted_path,TOKEN=TOKEN)
    scored_path = f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl"
    with open(scored_path, "w") as scored_file:
        scored_file.write(json.dumps(result) + "\n")

    # Upload the scored file next to the raw one.
    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Leaderboard row: submission metadata first, then the metrics copied
    # from the scorer's result under the same names.
    eval_entry = {
        "Model": model,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
    }
    metric_names = (
        "Delivery Rate",
        "Commonsense Constraint Micro Pass Rate",
        "Commonsense Constraint Macro Pass Rate",
        "Hard Constraint Micro Pass Rate",
        "Hard Constraint Macro Pass Rate",
        "Final Pass Rate",
    )
    for metric in metric_names:
        eval_entry[metric] = result[metric]

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)

    print(eval_results)

    eval_results.push_to_hub(EVAL_DATASET, config_name = 'scores', token=TOKEN)

    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
129
+
130
+
131
def refresh():
    """Re-download the scores dataset and rebuild both leaderboard tables.

    Returns:
        tuple: ``(validation_table, test_table)`` as produced by
        get_dataframe_from_results.
    """
    # NOTE(review): `ignore_verifications` may be deprecated in newer
    # `datasets` releases — confirm against the pinned version.
    latest = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
    validation_table = get_dataframe_from_results(eval_results=latest, split="validation")
    test_table = get_dataframe_from_results(eval_results=latest, split="test")
    return validation_table, test_table
136
+
137
+ # def upload_file(files):
138
+ # file_paths = [file.name for file in files]
139
+ # return file_paths
140
+
141
+
142
+ demo = gr.Blocks()
143
+ with demo:
144
+ gr.HTML(TITLE)
145
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
146
+
147
+ with gr.Tab("Results: Validation"):
148
+ leaderboard_table_val = gr.components.Dataframe(
149
+ value=eval_dataframe_val, interactive=False,
150
+ )
151
+ with gr.Tab("Results: Test"):
152
+ leaderboard_table_test = gr.components.Dataframe(
153
+ value=eval_dataframe_test, interactive=False,
154
+ )
155
+
156
+ refresh_button = gr.Button("Refresh")
157
+ refresh_button.click(
158
+ refresh,
159
+ inputs=[],
160
+ outputs=[
161
+ leaderboard_table_val,
162
+ leaderboard_table_test,
163
+ ],
164
+ )
165
+ with gr.Accordion("Submit a new file for evaluation"):
166
+ with gr.Row():
167
+ with gr.Column():
168
+ level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
169
+ eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
170
+ model = gr.Textbox(label="Foundation Model")
171
+ planning_strategy = gr.Textbox(label="Planning Strategy")
172
+ with gr.Column():
173
+ organization = gr.Textbox(label="Organization")
174
+ mail = gr.Textbox(label="Contact email")
175
+ file_output = gr.File()
176
+
177
+
178
+ submit_button = gr.Button("Submit Eval")
179
+ submission_result = gr.Markdown()
180
+ submit_button.click(
181
+ add_new_eval,
182
+ [
183
+ level_of_test,
184
+ eval_mode,
185
+ model,
186
+ planning_strategy,
187
+ organization,
188
+ mail,
189
+ file_output,
190
+ ],
191
+ submission_result,
192
+ )
193
+
194
+ # scheduler = BackgroundScheduler()
195
+ # scheduler.add_job(restart_space, "interval", seconds=3600)
196
+ # scheduler.start()
197
+ demo.launch(debug=True)
content.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TITLE = """<h1 align="center" id="space-title">TravelBench Leaderboard</h1>"""
2
+
3
+ INTRODUCTION_TEXT = """
4
+ TravelBench is a benchmark crafted for evaluating language agents in tool-use and complex planning within multiple constraints. (See our [paper](https://arxiv.org/abs/2311.12983) for more details.)
5
+
6
+ ## Data
7
+ In TravelBench, for a given query, language agents are expected to formulate a comprehensive plan that includes transportation, daily meals, attractions, and accommodation for each day.
8
+ For constraints, from the perspective of real world applications, we design three types of them: Environment Constraint, Commonsense Constraint, and Hard Constraint.
9
+ TravelBench comprises 1,225 queries in total. The number of days and hard constraints are designed to test agents' abilities across both the breadth and depth of complex planning.
10
+
11
+ TravelBench data can be found in [this dataset](https://huggingface.co/datasets/osunlp/TravelBench).
12
+
13
+ ## Submission Guidelines for TravelBench
14
+ Participants are invited to submit results for both validation and testing phases. The submissions will be evaluated based on several metrics: delivery rate, commonsense constraint pass rate (micro/macro), hard constraint pass rate (micro/macro), and the final pass rate.
15
+
16
+ ### Format of Submission:
17
+ Submissions must be in the form of a JSON-line file. Each line should adhere to the following structure:
18
+ ```
19
+ {"idx":0,"query":"Natural Language Query","plan":[{"day": 1, "current_city": "from [City A] to [City B]", "transportation": "Flight Number: XXX, from A to B", "breakfast": "Name, City", "attraction": "Name, City;Name, City;...;Name, City;", "lunch": "Name, City", "dinner": "Name, City", "accommodation": "Name, City"}, {"day": 2, "current_city": "City B", "transportation": "-", "breakfast": "Name, City", "attraction": "Name, City;Name, City;", "lunch": "Name, City", "dinner": "Name, City", "accommodation": "Name, City"}, ...]}
20
+ ```
21
+ Explanation of Fields:
22
+ #### day:
23
+ Description: Indicates the specific day in the itinerary.
24
+ Format: Enter the numerical value representing the sequence of the day within the travel plan. For instance, '1' for the first day, '2' for the second day, and so on.
25
+
26
+ #### current city:
27
+ Description: Indicates the city where the traveler is currently located.
28
+ Format: When there is a change in location, use "from [City A] to [City B]" to denote the transition. If remaining in the same city, simply use the city's name (e.g., "City A").
29
+
30
+ #### transportation:
31
+ Description: Specifies the mode of transportation used.
32
+ Format: For flights, include the details in the format "Flight Number: XXX, from [City A] to [City B]". For self-driven or taxi travel, use "self-driving/taxi, from [City A] to [City B]". If there is no travel between cities on that day, use "-".
33
+
34
+ #### breakfast, lunch, and dinner:
35
+ Description: Details about dining arrangements.
36
+ Format: Use "Name, City" to specify the chosen restaurant and its location. If a meal is not planned, use "-".
37
+
38
+ #### attraction:
39
+ Description: Information about attractions visited.
40
+ Format: List attractions as "Name, City". If visiting multiple attractions, separate them with a semicolon ";". If no attraction is planned, use "-".
41
+
42
+ Please refer to [this](https://huggingface.co/datasets/osunlp/TravelBench/resolve/main/example_submission.jsonl?download=true) for example submission file.
43
+
44
+ Submission made by our team are labelled "TravelBench authors". Each submission will be automatically evaluated and scored based on the predefined metrics. The scores and rankings will be updated and displayed on the leaderboard.
45
+
46
+ """
47
+
48
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
49
+ CITATION_BUTTON_TEXT = r"""@misc{Xie2024TravelBench,
50
+ title={},
51
+ author={},
52
+ year={2024},
53
+ eprint={,
54
+ archivePrefix={arXiv},
55
+ primaryClass={cs.CL}
56
+ }"""
57
+
58
+
59
def format_error(msg):
    """Wrap *msg* in a centered, red, 20px HTML paragraph."""
    css = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{msg}</p>"
61
+
62
def format_warning(msg):
    """Wrap *msg* in a centered, orange, 20px HTML paragraph."""
    css = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{msg}</p>"
64
+
65
def format_log(msg):
    """Wrap *msg* in a centered, green, 20px HTML paragraph."""
    css = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{msg}</p>"
67
+
68
def model_hyperlink(link, model_name):
    """Build an HTML anchor that opens *link* in a new tab, labelled *model_name*."""
    anchor_css = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_css}">{model_name}</a>'
70
+
database/.DS_Store ADDED
Binary file (8.2 kB). View file
 
database/accommodations/.DS_Store ADDED
Binary file (6.15 kB). View file
 
database/accommodations/clean_accommodations_2022.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/attractions/attractions.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/background/attractions.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/background/citySet.txt ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ San Diego
2
+ Pellston
3
+ Buffalo
4
+ Charlotte Amalie
5
+ Flagstaff
6
+ Evansville
7
+ Hilo
8
+ Twin Falls
9
+ Newark
10
+ State College
11
+ Johnstown
12
+ Montgomery
13
+ Redding
14
+ Lynchburg
15
+ South Bend
16
+ Sarasota
17
+ Sioux Falls
18
+ Paducah
19
+ Kahului
20
+ Atlantic City
21
+ Bemidji
22
+ Toledo
23
+ Abilene
24
+ Sacramento
25
+ Amarillo
26
+ Moline
27
+ Hilton Head
28
+ Manhattan
29
+ Minneapolis
30
+ Fort Myers
31
+ Roswell
32
+ Harlingen
33
+ Seattle
34
+ Manchester
35
+ Gulfport
36
+ Gainesville
37
+ Pago Pago
38
+ Wrangell
39
+ Augusta
40
+ Waterloo
41
+ Yuma
42
+ Saipan
43
+ Christiansted
44
+ North Bend
45
+ Richmond
46
+ Albuquerque
47
+ Nashville
48
+ Aberdeen
49
+ Harrisburg
50
+ Fort Wayne
51
+ Green Bay
52
+ Wenatchee
53
+ Santa Fe
54
+ St. Petersburg
55
+ Belleville
56
+ Greensboro
57
+ Lake Charles
58
+ Traverse City
59
+ Erie
60
+ Niagara Falls
61
+ Pocatello
62
+ Idaho Falls
63
+ Alpena
64
+ Wilmington
65
+ Ontario
66
+ Iron Mountain
67
+ Lubbock
68
+ Helena
69
+ Kalamazoo
70
+ Cleveland
71
+ Grand Island
72
+ New Bern
73
+ Melbourne
74
+ Bristol
75
+ Orlando
76
+ Bismarck
77
+ Fresno
78
+ Billings
79
+ Daytona Beach
80
+ College Station
81
+ Jacksonville
82
+ Salt Lake City
83
+ Corpus Christi
84
+ Florence
85
+ Moab
86
+ Grand Forks
87
+ Las Vegas
88
+ Fairbanks
89
+ Petersburg
90
+ Wichita
91
+ Rhinelander
92
+ Kansas City
93
+ Dothan
94
+ Alamosa
95
+ Adak Island
96
+ Islip
97
+ Wichita Falls
98
+ Presque Isle
99
+ San Luis Obispo
100
+ Dayton
101
+ Fort Smith
102
+ Martha's Vineyard
103
+ Portland
104
+ Waco
105
+ New York
106
+ Columbus
107
+ Tampa
108
+ Little Rock
109
+ Kona
110
+ Clarksburg
111
+ San Angelo
112
+ Saginaw
113
+ Houston
114
+ Duluth
115
+ Valparaiso
116
+ Phoenix
117
+ Oakland
118
+ Watertown
119
+ Ogden
120
+ Cedar Rapids
121
+ Cape Girardeau
122
+ Sun Valley
123
+ Sault Ste. Marie
124
+ Trenton
125
+ Missoula
126
+ Pasco
127
+ Brainerd
128
+ Newburgh
129
+ Gustavus
130
+ Branson
131
+ Providence
132
+ Minot
133
+ Huntsville
134
+ San Antonio
135
+ Marquette
136
+ Owensboro
137
+ Del Rio
138
+ Portsmouth
139
+ Bloomington
140
+ Lexington
141
+ Santa Barbara
142
+ Baltimore
143
+ Panama City
144
+ Kodiak
145
+ Yakima
146
+ Vernal
147
+ Salisbury
148
+ Mission
149
+ Newport News
150
+ Charlottesville
151
+ Grand Junction
152
+ Baton Rouge
153
+ Beaumont
154
+ Staunton
155
+ Kalispell
156
+ Key West
157
+ Worcester
158
+ West Palm Beach
159
+ Boise
160
+ Grand Rapids
161
+ Salina
162
+ Fort Leonard Wood
163
+ Walla Walla
164
+ Everett
165
+ Dillingham
166
+ Lansing
167
+ Madison
168
+ Victoria
169
+ Sioux City
170
+ Hattiesburg
171
+ Stockton
172
+ Anchorage
173
+ Charlotte
174
+ Jamestown
175
+ Laramie
176
+ Decatur
177
+ Durango
178
+ Longview
179
+ Syracuse
180
+ St. Cloud
181
+ Santa Rosa
182
+ Bakersfield
183
+ North Platte
184
+ La Crosse
185
+ Plattsburgh
186
+ Concord
187
+ Atlanta
188
+ Provo
189
+ Ogdensburg
190
+ Ithaca
191
+ Colorado Springs
192
+ Washington
193
+ Williston
194
+ Tulsa
195
+ Midland
196
+ Champaign
197
+ Devils Lake
198
+ Greer
199
+ Muskegon
200
+ Hibbing
201
+ Santa Ana
202
+ Ponce
203
+ Prescott
204
+ Indianapolis
205
+ International Falls
206
+ Rapid City
207
+ Ketchikan
208
+ St. Louis
209
+ Santa Maria
210
+ Elmira
211
+ Alexandria
212
+ San Jose
213
+ Tucson
214
+ San Juan
215
+ Dubuque
216
+ Burbank
217
+ Gunnison
218
+ Cedar City
219
+ Hyannis
220
+ Raleigh
221
+ Norfolk
222
+ New Orleans
223
+ Medford
224
+ White Plains
225
+ Oklahoma City
226
+ Chicago
227
+ El Paso
228
+ Rockford
229
+ Aguadilla
230
+ Omaha
231
+ Scottsbluff
232
+ Yakutat
233
+ Arcata
234
+ Spokane
235
+ Brownsville
236
+ Bend
237
+ Hagerstown
238
+ Peoria
239
+ Appleton
240
+ Roanoke
241
+ Eugene
242
+ Rock Springs
243
+ Dodge City
244
+ Austin
245
+ Miami
246
+ Dallas
247
+ Mosinee
248
+ Killeen
249
+ Lihue
250
+ Pittsburgh
251
+ Tallahassee
252
+ Butte
253
+ Lawton
254
+ Honolulu
255
+ Greenville
256
+ Juneau
257
+ Myrtle Beach
258
+ Boston
259
+ Charleston
260
+ Latrobe
261
+ Knoxville
262
+ Denver
263
+ Bangor
264
+ Albany
265
+ Punta Gorda
266
+ Fort Lauderdale
267
+ Philadelphia
268
+ Binghamton
269
+ Great Falls
270
+ Shreveport
271
+ Asheville
272
+ Cheyenne
273
+ Milwaukee
274
+ Nome
275
+ Laredo
276
+ Des Moines
277
+ Fayetteville
278
+ Lewisburg
279
+ Fort Dodge
280
+ Cody
281
+ Chattanooga
282
+ Deadhorse
283
+ Kotzebue
284
+ Sitka
285
+ Bozeman
286
+ Palm Springs
287
+ Memphis
288
+ Nantucket
289
+ Texarkana
290
+ Lewiston
291
+ Valdosta
292
+ Birmingham
293
+ Scranton
294
+ Pensacola
295
+ Hancock
296
+ Los Angeles
297
+ Mason City
298
+ Savannah
299
+ West Yellowstone
300
+ Long Beach
301
+ Reno
302
+ Akron
303
+ Louisville
304
+ Hartford
305
+ Cincinnati
306
+ Rochester
307
+ San Francisco
308
+ Detroit
309
+ Monterey
310
+ Escanaba
311
+ Eau Claire
database/background/citySet_with_states.txt ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ San Diego California
2
+ Pellston Michigan
3
+ Buffalo New York
4
+ Charlotte Amalie St. Thomas
5
+ Flagstaff Arizona
6
+ Evansville Indiana
7
+ Hilo Hawaii
8
+ Twin Falls Idaho
9
+ Newark New Jersey
10
+ State College Pennsylvania
11
+ Johnstown Pennsylvania
12
+ Charleston South Carolina
13
+ Montgomery Alabama
14
+ Redding California
15
+ Lynchburg Virginia
16
+ South Bend Indiana
17
+ Sarasota Florida
18
+ Sioux Falls South Dakota
19
+ Paducah Kentucky
20
+ Kahului Hawaii
21
+ Atlantic City New Jersey
22
+ Bemidji Minnesota
23
+ Toledo Ohio
24
+ Abilene Texas
25
+ Sacramento California
26
+ Amarillo Texas
27
+ Moline Illinois
28
+ Hilton Head South Carolina
29
+ Manhattan New York
30
+ Minneapolis Minnesota
31
+ Fort Myers Florida
32
+ Roswell New Mexico
33
+ Harlingen Texas
34
+ Seattle Washington
35
+ Manchester England
36
+ Gulfport Mississippi
37
+ Gainesville Florida
38
+ Pago Pago Eastern District
39
+ Wrangell Alaska
40
+ Augusta Georgia
41
+ Waterloo Wallonia
42
+ Yuma Arizona
43
+ Saipan Saipan
44
+ Christiansted St. Croix
45
+ North Bend Oregon
46
+ Richmond Virginia
47
+ Albuquerque New Mexico
48
+ Nashville Tennessee
49
+ Aberdeen Scotland
50
+ Harrisburg Pennsylvania
51
+ Fort Wayne Indiana
52
+ Green Bay Wisconsin
53
+ Wenatchee Washington
54
+ Santa Fe New Mexico
55
+ St. Petersburg Saint Petersburg
56
+ Belleville Illinois
57
+ Greensboro North Carolina
58
+ Lake Charles Louisiana
59
+ Traverse City Michigan
60
+ Erie Pennsylvania
61
+ Niagara Falls New York
62
+ Pocatello Idaho
63
+ Idaho Falls Idaho
64
+ Alpena Michigan
65
+ Wilmington North Carolina
66
+ Ontario Ontario
67
+ Iron Mountain Michigan
68
+ Lubbock Texas
69
+ Helena Montana
70
+ Kalamazoo Michigan
71
+ Cleveland Ohio
72
+ Grand Island Nebraska
73
+ New Bern North Carolina
74
+ Melbourne Victoria
75
+ Bristol Tennessee
76
+ Orlando Florida
77
+ Bismarck North Dakota
78
+ Fresno California
79
+ Billings Montana
80
+ Jackson Mississippi
81
+ Daytona Beach Florida
82
+ College Station Texas
83
+ Jacksonville Florida
84
+ Salt Lake City Utah
85
+ Corpus Christi Texas
86
+ Florence Tuscany
87
+ Moab Utah
88
+ Grand Forks North Dakota
89
+ Las Vegas Nevada
90
+ Fairbanks Alaska
91
+ Petersburg Virginia
92
+ Wichita Kansas
93
+ Rhinelander Wisconsin
94
+ Kansas City Missouri
95
+ Dothan Alabama
96
+ Alamosa Colorado
97
+ Adak Island Alaska
98
+ Islip New York
99
+ Wichita Falls Texas
100
+ Presque Isle Maine
101
+ San Luis Obispo California
102
+ Dayton Ohio
103
+ Fort Smith Arkansas
104
+ Martha's Vineyard Massachusetts
105
+ Portland Oregon
106
+ Waco Texas
107
+ New York New York
108
+ Columbus Ohio
109
+ Tampa Florida
110
+ Little Rock Arkansas
111
+ Kona Hawaii
112
+ Clarksburg West Virginia
113
+ San Angelo Texas
114
+ Saginaw Michigan
115
+ Houston Texas
116
+ Duluth Minnesota
117
+ Valparaiso Indiana
118
+ Phoenix Arizona
119
+ Oakland California
120
+ Watertown New York
121
+ Ogden Utah
122
+ Cedar Rapids Iowa
123
+ Cape Girardeau Missouri
124
+ Sun Valley Idaho
125
+ Sault Ste. Marie Ontario
126
+ Trenton New Jersey
127
+ Missoula Montana
128
+ Pasco Washington
129
+ Brainerd Minnesota
130
+ Newburgh New York
131
+ Gustavus Minnesota
132
+ Branson Missouri
133
+ Providence Rhode Island
134
+ Minot North Dakota
135
+ Huntsville Alabama
136
+ San Antonio Texas
137
+ Marquette Wisconsin
138
+ Owensboro Kentucky
139
+ Del Rio Texas
140
+ Portsmouth England
141
+ Bloomington Illinois
142
+ Lexington Kentucky
143
+ Santa Barbara California
144
+ Baltimore Maryland
145
+ Panama City Florida
146
+ Kodiak Alaska
147
+ Yakima Washington
148
+ Vernal Utah
149
+ Salisbury Maryland
150
+ Mission Texas
151
+ Newport News Virginia
152
+ Charlottesville Virginia
153
+ Grand Junction Colorado
154
+ Baton Rouge Louisiana
155
+ Beaumont Texas
156
+ Staunton Virginia
157
+ Kalispell Montana
158
+ Key West Florida
159
+ Worcester England
160
+ West Palm Beach Florida
161
+ Boise Idaho
162
+ Grand Rapids Michigan
163
+ Salina Kansas
164
+ Fort Leonard Wood Missouri
165
+ Walla Walla Washington
166
+ Everett Washington
167
+ Dillingham Alaska
168
+ Lansing Michigan
169
+ Madison Wisconsin
170
+ Victoria Victoria
171
+ Sioux City Iowa
172
+ Hattiesburg Mississippi
173
+ Stockton California
174
+ Anchorage Alaska
175
+ Charlotte North Carolina
176
+ Jamestown Virginia
177
+ Laramie Wyoming
178
+ Decatur Georgia
179
+ Durango Colorado
180
+ Longview Texas
181
+ Syracuse New York
182
+ St. Cloud Minnesota
183
+ Santa Rosa California
184
+ Bakersfield California
185
+ North Platte Nebraska
186
+ La Crosse Wisconsin
187
+ Plattsburgh New York
188
+ Concord New Hampshire
189
+ Atlanta Georgia
190
+ Provo Utah
191
+ Ogdensburg New York
192
+ Ithaca New York
193
+ Colorado Springs Colorado
194
+ Washington District of Columbia
195
+ Williston North Dakota
196
+ Tulsa Oklahoma
197
+ Midland Texas
198
+ Champaign Illinois
199
+ Devils Lake Wisconsin
200
+ Greer South Carolina
201
+ Muskegon Michigan
202
+ Hibbing Minnesota
203
+ Santa Ana California
204
+ Ponce Ponce
205
+ Prescott Arizona
206
+ Indianapolis Indiana
207
+ International Falls Minnesota
208
+ Rapid City South Dakota
209
+ Ketchikan Alaska
210
+ St. Louis Missouri
211
+ Santa Maria California
212
+ Elmira New York
213
+ Alexandria Alexandria Governorate
214
+ San Jose California
215
+ Tucson Arizona
216
+ San Juan San Juan
217
+ Dubuque Iowa
218
+ Burbank California
219
+ Gunnison Colorado
220
+ Cedar City Utah
221
+ Hyannis Massachusetts
222
+ Raleigh North Carolina
223
+ Norfolk Virginia
224
+ New Orleans Louisiana
225
+ Medford Oregon
226
+ White Plains New York
227
+ Oklahoma City Oklahoma
228
+ Chicago Illinois
229
+ El Paso Texas
230
+ Rockford Illinois
231
+ Aguadilla Aguadilla
232
+ Omaha Nebraska
233
+ Scottsbluff Nebraska
234
+ Yakutat Alaska
235
+ Arcata California
236
+ Spokane Washington
237
+ Brownsville Texas
238
+ Bend Oregon
239
+ Hagerstown Maryland
240
+ Peoria Illinois
241
+ Appleton Wisconsin
242
+ Roanoke Virginia
243
+ Eugene Oregon
244
+ Rock Springs Wyoming
245
+ Dodge City Kansas
246
+ Austin Texas
247
+ Miami Florida
248
+ Dallas Texas
249
+ Mosinee Wisconsin
250
+ Killeen Texas
251
+ Lihue Hawaii
252
+ Pittsburgh Pennsylvania
253
+ Tallahassee Florida
254
+ Butte California
255
+ Lawton Oklahoma
256
+ Honolulu Hawaii
257
+ Greenville South Carolina
258
+ Juneau Alaska
259
+ Myrtle Beach South Carolina
260
+ Boston Massachusetts
261
+ Latrobe Pennsylvania
262
+ Knoxville Tennessee
263
+ Denver Colorado
264
+ Bangor Maine
265
+ Albany New York
266
+ Punta Gorda Florida
267
+ Fort Lauderdale Florida
268
+ Philadelphia Pennsylvania
269
+ Binghamton New York
270
+ Great Falls Montana
271
+ Shreveport Louisiana
272
+ Asheville North Carolina
273
+ Cheyenne Wyoming
274
+ Milwaukee Wisconsin
275
+ Nome Alaska
276
+ Laredo Texas
277
+ Des Moines Iowa
278
+ Fayetteville North Carolina
279
+ Lewisburg Pennsylvania
280
+ Fort Dodge Iowa
281
+ Cody Wyoming
282
+ Chattanooga Tennessee
283
+ Deadhorse Alaska
284
+ Kotzebue Alaska
285
+ Sitka Alaska
286
+ Bozeman Montana
287
+ Palm Springs California
288
+ Memphis Tennessee
289
+ Nantucket Massachusetts
290
+ Texarkana Texas
291
+ Lewiston Idaho
292
+ Valdosta Georgia
293
+ Birmingham England
294
+ Scranton Pennsylvania
295
+ Pensacola Florida
296
+ Hancock Michigan
297
+ Los Angeles California
298
+ Mason City Iowa
299
+ Savannah Georgia
300
+ West Yellowstone Montana
301
+ Long Beach California
302
+ Reno Nevada
303
+ Akron Ohio
304
+ Louisville Kentucky
305
+ Hartford Connecticut
306
+ Cincinnati Ohio
307
+ Rochester New York
308
+ San Francisco California
309
+ Detroit Michigan
310
+ Monterey California
311
+ Escanaba Michigan
312
+ Eau Claire Wisconsin
database/background/clean_data.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Keep only the rows of citySet_with_states.txt whose city (the text before
# the first tab) is listed in citySet.txt, then rewrite the file in place.
CITY_SET_PATH = 'database/background/citySet.txt'
CITY_STATE_PATH = 'database/background/citySet_with_states.txt'

with open(CITY_SET_PATH, 'r') as f:
    # A set gives O(1) membership tests in the filter below.
    city_set = set(f.read().strip().split('\n'))

with open(CITY_STATE_PATH, 'r') as f:
    lines = f.read().strip().split('\n')

# Preserve the original row order; only drop rows whose city is unknown.
data = [unit for unit in lines if unit.split('\t')[0] in city_set]

# The `with` statement closes the file; no explicit close() is needed.
with open(CITY_STATE_PATH, 'w') as f:
    f.writelines(unit + '\n' for unit in data)
database/background/get_state_set.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Print the current working directory: the relative paths below are resolved
# against it.
print(os.getcwd())

state_set = set()
city_set = set()

with open('database/background/citySet_with_states.txt', 'r') as f:
    # Read the rows into their own variable. The previous version rebound
    # `city_set` to this list and then called `.add()` on it, which raised
    # AttributeError on the first row.
    rows = f.read().strip().split('\n')

for row in rows:
    # Each row is "<city>\t<state>".
    fields = row.split('\t')
    city_name = fields[0]
    state_name = fields[1]
    state_set.add(state_name)
    city_set.add(city_name)

# Disabled export of the state names, kept for reference:
# with open('database/background/stateSet.txt', 'a') as f:
#     for state_name in state_set:
#         f.write(state_name.split('\\')[0] + '\n')

# Append the distinct city names to a new file. Sorting makes the output
# reproducible (set iteration order is arbitrary).
with open('database/background/citySet_2.txt', 'a') as f:
    for city_name in sorted(city_set):
        f.write(city_name.split('\\')[0] + '\n')
database/background/stateSet.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wallonia
2
+ St. Thomas
3
+ Alaska
4
+ Washington
5
+ Kansas
6
+ Scotland
7
+ Michigan
8
+ Eastern District
9
+ New Jersey
10
+ Utah
11
+ Alexandria Governorate
12
+ North Dakota
13
+ Connecticut
14
+ West Virginia
15
+ Aguadilla
16
+ North Carolina
17
+ Ohio
18
+ Colorado
19
+ Arkansas
20
+ New York
21
+ Mississippi
22
+ San Juan
23
+ Minnesota
24
+ California
25
+ Maine
26
+ Nebraska
27
+ Idaho
28
+ Alabama
29
+ Texas
30
+ Maryland
31
+ England
32
+ New Mexico
33
+ South Carolina
34
+ Montana
35
+ Ponce
36
+ Tennessee
37
+ Florida
38
+ Oklahoma
39
+ Hawaii
40
+ New Hampshire
41
+ Iowa
42
+ Oregon
43
+ Wyoming
44
+ Pennsylvania
45
+ Tuscany
46
+ Virginia
47
+ Indiana
48
+ Missouri
49
+ District of Columbia
50
+ Saint Petersburg
51
+ Nevada
52
+ Massachusetts
53
+ Louisiana
54
+ Wisconsin
55
+ Saipan
56
+ Ontario
57
+ St. Croix
58
+ Kentucky
59
+ South Dakota
60
+ Arizona
61
+ Georgia
62
+ Rhode Island
63
+ Illinois
64
+ None
65
+ Victoria
database/background/test.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
# Report duplicate entries in citySet.txt by printing every repeated line.
CITY_SET_PATH = '/home/xj/toolAugEnv/code/toolConstraint/database/background/citySet.txt'

# Use a context manager so the file handle is closed after reading.
with open(CITY_SET_PATH, 'r') as city_file:
    f = city_file.read().strip().split('\n')

citySet = []
seen = set()  # O(1) duplicate detection; `citySet` keeps first-seen order
for line in f:
    # Compare the stripped name: entries are stored stripped, so comparing
    # the raw line would miss duplicates that differ only in whitespace.
    name = line.strip()
    if name in seen:
        print(line)
    else:
        seen.add(name)
        citySet.append(name)
database/flights/.DS_Store ADDED
Binary file (6.15 kB). View file
 
database/flights/clean_Flights_2022.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dafdb0e3f8b79ce599a1e612a772865295bc226b46e5fb278368f7255b11cee
3
+ size 304807007
database/googleDistanceMatrix/clean_data.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ import csv
4
+
5
def extract_before_parenthesis(s):
    """Return the text of ``s`` that precedes its first ``(...)`` group.

    Args:
    - s (str): The input string.

    Returns:
    - str: Everything before the first parenthesised group (trailing
      whitespace is kept), or ``s`` unchanged when it contains no such group.
    """
    found = re.search(r'^(.*?)\([^)]*\)', s)
    if found is None:
        return s
    return found.group(1)
8
+
9
if __name__ == '__main__':
    # Rewrite distance.csv as distance2.csv, dropping any "(...)" suffix from
    # the origin and destination columns; the other columns are copied as-is.
    source_path = '/home/xj/toolAugEnv/code/toolConstraint/database/googleDistanceMatrix/distance.csv'
    target_path = '/home/xj/toolAugEnv/code/toolConstraint/database/googleDistanceMatrix/distance2.csv'

    table = pd.read_csv(source_path)
    records = table.to_dict(orient='split')['data']
    fieldnames = ['origin', 'destination', 'cost', 'duration', 'distance']

    with open(target_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            cleaned = {
                'origin': extract_before_parenthesis(record[0]),
                'destination': extract_before_parenthesis(record[1]),
                'cost': record[2],
                'duration': record[3],
                'distance': record[4],
            }
            writer.writerow(cleaned)
database/googleDistanceMatrix/distance.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/googleDistanceMatrix/distance_org.csv ADDED
The diff for this file is too large to render. See raw diff
 
database/restaurants/.DS_Store ADDED
Binary file (6.15 kB). View file
 
database/restaurants/clean_restaurant_2022.csv ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/.DS_Store ADDED
Binary file (6.15 kB). View file
 
evaluation/__pycache__/commonsenseConstraint.cpython-39.pyc ADDED
Binary file (14 kB). View file
 
evaluation/__pycache__/eval.cpython-39.pyc ADDED
Binary file (7.05 kB). View file
 
evaluation/__pycache__/hardConstraint.cpython-39.pyc ADDED
Binary file (8.13 kB). View file
 
evaluation/commonsenseConstraint.py ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from annotation.src.utils import get_valid_name_city,extract_before_parenthesis,extract_numbers_from_filenames
2
+ from tools.flights.apis import Flights
3
+ from tools.accommodations.apis import Accommodations
4
+ from tools.restaurants.apis import Restaurants
5
+ from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
6
+ from tools.attractions.apis import Attractions
7
+ import math
8
+ import json
9
+ import re
10
+ import os
11
+ import sys
12
+ from tqdm import tqdm
13
+ import argparse
14
+
15
# NOTE(review): this path tweak runs after the project imports at the top of
# the file, so it cannot help those imports resolve - confirm whether it is
# still needed.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
# Make the relative data paths below resolve against this file's directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Shared lookup tools used by the checks in this module.
flight = Flights()
accommodation = Accommodations()
restaurants = Restaurants()
googleDistanceMatrix = GoogleDistanceMatrix()
attractions = Attractions()

# Raw "<city>\t<state>" rows; read through a context manager so the file
# handle is closed.
with open('../database/background/citySet_with_states.txt', 'r') as city_state_file:
    city_state_set = city_state_file.read().split('\n')

# Map city -> state. Blank rows (e.g. the empty string produced by a trailing
# newline) are skipped because they cannot be unpacked into two fields.
city_state_map = {
    x: y
    for x, y in (unit.split('\t') for unit in city_state_set if unit.strip())
}
26
+
27
+
28
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    Blank lines are skipped, so an empty file yields an empty list instead of
    raising ``json.JSONDecodeError``.

    Args:
    - filename (str): Path of a UTF-8 text file with one JSON document per line.

    Returns:
    - list: The decoded objects, in file order.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty files, trailing newlines and blank separators.
                continue
            data.append(json.loads(line))
    return data
35
+
36
+
37
def count_consecutive_values(lst):
    """Run-length encode ``lst``.

    Args:
    - lst (list): Values to group; may be empty.

    Returns:
    - list: ``(value, count)`` tuples, one per run of equal adjacent values,
      in order of appearance. An empty input gives ``[]``.
    """
    if not lst:
        return []

    runs = []  # each item is a mutable [value, count] pair
    for value in lst:
        if runs and value == runs[-1][0]:
            runs[-1][1] += 1  # still inside the current run
        else:
            runs.append([value, 1])  # a new run starts here

    return [tuple(run) for run in runs]
55
+
56
+
57
def transportation_match(text: str):
    """Classify a transportation description by keyword.

    The keywords are tested case-insensitively in the order taxi,
    self-driving, flight; the first one found wins.

    Args:
    - text (str): The transportation description.

    Returns:
    - str or None: 'Taxi', 'Self-driving' or 'Flight', or None when no
      keyword occurs in ``text``.
    """
    lowered = text.lower()
    keyword_labels = (
        ('taxi', 'Taxi'),
        ('self-driving', 'Self-driving'),
        ('flight', 'Flight'),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return None
67
+
68
+
69
def extract_from_to(text: str):
    """
    Pull the two endpoints out of a "from A to B" phrase in the given text.

    'B' extends up to the first comma after "to", or to the end of the string
    when there is no comma. Matching is case-sensitive.

    Args:
    - text (str): The input string.

    Returns:
    - tuple: ('A', 'B') for the first match, or (None, None) if the text has
      no such phrase.
    """
    route_pattern = r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)"
    found = re.search(route_pattern, text)
    if found is None:
        return (None, None)
    return found.groups()
82
+
83
+
84
+
85
def is_valid_city_sequence(city_list):
    """
    Decide whether a list of visited cities forms a valid itinerary order.

    The list is scanned as runs of equal adjacent cities. It is rejected when
    it has fewer than 3 entries, when a run starting strictly inside the list
    repeats a city from an earlier run, or when a run strictly inside the list
    has length 1.

    Args:
    - city_list (list): List of cities.

    Returns:
    - bool: True if the sequence is valid, False otherwise.
    """
    total = len(city_list)
    if total < 3:
        return False

    last_index = total - 1
    seen = set()
    start = 0
    while start < total:
        city = city_list[start]

        # A city may only come back as the very last entry of the list.
        if city in seen and start not in (0, last_index):
            return False

        # Find the end (exclusive) of the run of `city` beginning at `start`.
        end = start
        while end < total and city_list[end] == city:
            end += 1

        # A lone occurrence strictly between the first and last entry is invalid.
        if end - start == 1 and 0 < start < last_index:
            return False

        seen.add(city)
        start = end

    return True
125
+
126
+
127
+
128
def is_reasonalbe_visiting_city(question, tested_data):
    """
    Check that the cities visited over the plan form a reasonable route.

    The route is built from each day's 'current_city': a value containing
    'from' contributes its two endpoints, any other value contributes itself
    (text before a parenthesis in both cases). The route must start from
    question['org'] when day 1 is a travel day, start and end in the same
    city, pass is_valid_city_sequence, and contain only cities known to
    city_state_map. For trips longer than 3 days, every city except the first
    and last route entry must map to question['dest'].

    Args:
    - question (dict): Query; reads 'days', 'org' and 'dest'.
    - tested_data (list): Per-day plan dicts; reads 'current_city'.

    Returns:
    - tuple: (True, None) when valid, otherwise (False, reason).
    """
    route = []

    for day_index in range(min(question['days'], len(tested_data))):
        entry = tested_data[day_index]['current_city']

        if 'from' not in entry:
            route.append(extract_before_parenthesis(entry))
            continue

        departure, arrival = extract_from_to(entry)
        departure = extract_before_parenthesis(departure)
        arrival = extract_before_parenthesis(arrival)
        if day_index == 0 and departure != question['org']:
            return False, f"The first day's city should be {question['org']}."

        route.extend((departure, arrival))

    if route[0] != route[-1]:
        return False, "The trip should be a closed circle."

    if not is_valid_city_sequence(route):
        return False, "The city sequence is invalid."

    last_position = len(route) - 1
    for position, city in enumerate(route):
        if city not in city_state_map:
            return False, f"{city} is not a valid city."
        is_intermediate = position != 0 and position != last_position
        if is_intermediate and question['days'] > 3 and city_state_map[city] != question['dest']:
            return False, f"{city} is not in {question['dest']}."

    return True, None
161
+
162
+
163
def is_valid_restaurants(question, tested_data):
    """
    Check that no restaurant is used for more than one meal in the plan.

    Breakfast, lunch and dinner of each day are examined in that order; a
    missing, empty or '-' entry is ignored.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts; reads 'breakfast', 'lunch', 'dinner'.

    Returns:
    - tuple: (True, None) when all restaurants are distinct, otherwise
      (False, reason) for the first repeated one.
    """
    used = []

    for day_index in range(min(question['days'], len(tested_data))):
        plan = tested_data[day_index]

        for meal in ('breakfast', 'lunch', 'dinner'):
            choice = plan.get(meal)
            if not choice or choice == '-':
                continue

            if choice in used:
                # Only the lunch message names the offending restaurant.
                detail = f" {choice}" if meal == 'lunch' else ""
                return False, f"The restaurant in day {day_index + 1} {meal}{detail} is repeated."

            used.append(choice)

    return True, None
195
+
196
def is_valid_attractions(question, tested_data):
    """
    Check that no attraction is visited more than once in the plan.

    Each day's 'attraction' value is split on ';' and the final piece is
    dropped, i.e. the value is expected to end with ';'. A missing, empty or
    '-' value is ignored.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts; reads 'attraction'.

    Returns:
    - tuple: (True, None) when all attractions are distinct, otherwise
      (False, reason) for the first repeated one.
    """
    visited = []

    for day_index in range(min(question['days'], len(tested_data))):
        listing = tested_data[day_index].get('attraction')
        if not listing or listing == '-':
            continue

        # The piece after the last ';' is discarded (empty when the value
        # ends with ';').
        for attraction in listing.split(';')[:-1]:
            if attraction in visited:
                return False, f"The attraction '{attraction}' in day {day_index + 1} is repeated."
            visited.append(attraction)

    return True, None
214
+
215
def is_valid_transportation(question, tested_data):
    """
    Check that the plan does not mix conflicting transportation modes.

    Day 1 must have a non-empty transportation entry. Across the examined
    days, self-driving may not be combined with either a flight or a taxi.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts; reads 'transportation'.

    Returns:
    - tuple: (True, None) when valid, otherwise (False, reason).
    """
    first_leg = tested_data[0]['transportation']
    if not first_leg or first_leg == '-':
        return False, "The transportation in day 1 should not be empty."

    # Only the set of distinct modes matters for the conflict test.
    modes = {transportation_match(first_leg)}
    for day_index in range(min(question['days'], len(tested_data))):
        leg = tested_data[day_index].get('transportation')
        if leg and leg != '-':
            modes.add(transportation_match(leg))

    if 'Self-driving' in modes and ('Flight' in modes or 'Taxi' in modes):
        return False, "The transportation is conflicting."

    return True, None
235
+
236
def is_valid_information_in_current_city(question, tested_data):
    """
    Check that every entry of a day mentions a city the traveller is in that day.

    The day's cities come from 'current_city': both endpoints for a value
    containing 'from', otherwise the single city (text before a parenthesis
    in both cases). Then, for each non-empty entry ('-' counts as empty):
    - transportation must mention every city of the day;
    - each meal and each attraction must mention at least one of them;
    - accommodation must mention the last city of the day.

    Attractions are read as ';'-separated items and the final piece is
    dropped, i.e. the value is expected to end with ';'.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts.

    Returns:
    - tuple: (True, None) when valid, otherwise (False, reason).
    """
    for i in range(min(question['days'], len(tested_data))):
        unit = tested_data[i]
        current_city = unit['current_city']

        if 'from' in current_city:
            city1, city2 = extract_from_to(current_city)
            final_city_list = [
                extract_before_parenthesis(city1),
                extract_before_parenthesis(city2),
            ]
        else:
            # Must be a one-element list. A bare string here made the loops
            # below iterate over single characters and made `[-1]` the last
            # character, so the checks passed for almost any text.
            final_city_list = [extract_before_parenthesis(current_city)]

        transportation = unit.get('transportation')
        if transportation and transportation != '-':
            for city in final_city_list:
                if city not in transportation:
                    return False, f"The transportation in day {i+1} is invalid city choice."

        for meal in ('breakfast', 'lunch', 'dinner'):
            meal_value = unit.get(meal)
            if not meal_value or meal_value == '-':
                continue
            if not any(city in meal_value for city in final_city_list):
                return False, f"The {meal} in day {i+1} is invalid city choice."

        attraction_value = unit.get('attraction')
        if attraction_value and attraction_value != '-':
            for attraction in attraction_value.split(';')[:-1]:
                if not any(city in attraction for city in final_city_list):
                    return False, f"The attraction in day {i+1} is invalid city choice."

        accommodation_value = unit.get('accommodation')
        if accommodation_value and accommodation_value != '-':
            # The night is spent in the city reached at the end of the day.
            if final_city_list[-1] not in accommodation_value:
                return False, f"The accommodation in day {i+1} is invalid city choice."

    return True, None
321
+
322
+ # hallucination
323
def _sandbox_has_entry(table, name_column, city_column, name, city):
    """Return True if ``table`` has a row in ``city`` whose name contains ``name``."""
    name_hits = table[name_column].astype(str).str.contains(re.escape(name))
    return len(table[name_hits & (table[city_column] == city)]) >= 1


def is_valid_information_in_sandbox(question, tested_data):
    """
    Check that every entry of the plan exists in the sandbox data.

    For each examined day, non-empty entries ('-' counts as empty) are looked
    up as follows:
    - a transportation mentioning "flight number" must match a row of
      flight.data on flight number, origin city and destination city;
    - a self-driving or taxi transportation must have a non-None 'cost' from
      googleDistanceMatrix.run_for_evaluation for that mode;
    - meals, attractions and accommodation must match a row of the
      corresponding table by name (substring) and city.

    The route is parsed from the transportation text, falling back to
    'current_city' when that yields no "from A to B" phrase.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts.

    Returns:
    - tuple: (True, None) when valid, otherwise (False, reason).

    Raises:
    - ValueError: If a flight's origin/destination cannot be parsed.
    """
    for i in range(min(question['days'], len(tested_data))):
        unit = tested_data[i]

        # `.get` keeps a missing key from crashing, like the other fields.
        value = unit.get('transportation')
        if value and value != '-':
            lowered = value.lower()
            org_city, dest_city = extract_from_to(value)
            if org_city is None or dest_city is None:
                org_city, dest_city = extract_from_to(unit['current_city'])

            if 'flight number' in lowered:
                try:
                    org_city = extract_before_parenthesis(org_city)
                    dest_city = extract_before_parenthesis(dest_city)
                except TypeError:
                    raise ValueError("The transportation {} in day {} can not be parsed.".format(value,i+1))

                # Match the label case-insensitively, like the guard above, so
                # a differently-cased label no longer raises IndexError. The
                # number runs up to the next comma.
                number_match = re.search(r'flight number: ([^,]*)', value, re.IGNORECASE)
                if number_match is None:
                    return False, f"The flight number in day {i+1} is invalid in the sandbox."

                flights = flight.data
                hits = flights[
                    (flights['Flight Number'] == number_match.group(1))
                    & (flights['OriginCityName'] == org_city)
                    & (flights['DestCityName'] == dest_city)
                ]
                if len(hits) < 1:
                    return False, f"The flight number in day {i+1} is invalid in the sandbox."

            elif 'self-driving' in lowered or 'taxi' in lowered:
                try:
                    org_city = extract_before_parenthesis(org_city)
                    dest_city = extract_before_parenthesis(dest_city)
                except TypeError:
                    # Best effort: query with placeholders instead of failing.
                    org_city = '-'
                    dest_city = '-'
                    print("The transportation {} in day {} can not be parsed and '-' will be used instead.".format(value,i+1))

                mode = 'self-driving' if 'self-driving' in lowered else 'taxi'
                if googleDistanceMatrix.run_for_evaluation(org_city, dest_city, mode=mode)['cost'] is None:
                    return False, f"The {mode} in day {i+1} is invalid in the sandbox."

        for meal in ('breakfast', 'lunch', 'dinner'):
            meal_value = unit.get(meal)
            if not meal_value or meal_value == '-':
                continue
            name, city = get_valid_name_city(meal_value)
            if not _sandbox_has_entry(restaurants.data, 'Name', 'City', name, city):
                return False, f"The {meal} in day {i+1} is invalid in the sandbox."

        attraction_value = unit.get('attraction')
        if attraction_value and attraction_value != '-':
            # The piece after the last ';' is discarded (the value is
            # expected to end with ';').
            for attraction in attraction_value.split(';')[:-1]:
                name, city = get_valid_name_city(attraction)
                if not _sandbox_has_entry(attractions.data, 'Name', 'City', name, city):
                    return False, f"The attraction {attraction} in day {i+1} is invalid in the sandbox."

        accommodation_value = unit.get('accommodation')
        if accommodation_value and accommodation_value != '-':
            name, city = get_valid_name_city(accommodation_value)
            if not _sandbox_has_entry(accommodation.data, 'NAME', 'city', name, city):
                return False, f"The accommodation in day {i+1} is invalid in the sandbox."

    return True, None
399
+
400
+
401
def is_valid_accommodaton(question, tested_data):
    """
    Check that each stay respects the accommodation's minimum-nights rule.

    Consecutive days with the same 'accommodation' value count as one stay.
    A stay is only judged when exactly one row of accommodation.data matches
    its name (substring) and city; it fails when it is shorter than that
    row's 'minimum nights'.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts; reads 'accommodation'.

    Returns:
    - tuple: (True, None) when valid, otherwise (False, reason).
    """
    nightly = []
    for day_index in range(min(question['days'], len(tested_data))):
        plan = tested_data[day_index]
        if 'accommodation' not in plan:
            return False, "No Accommodation Info."
        nightly.append(plan['accommodation'])

    table = accommodation.data
    for stay in count_consecutive_values(nightly):
        if not stay or stay[0] in ['-', '']:
            continue

        name, city = get_valid_name_city(stay[0])
        name_hits = table['NAME'].astype(str).str.contains(re.escape(name))
        matches = table[name_hits & (table['city'] == city)]

        # Ambiguous or missing listings are not judged here.
        if len(matches) == 1 and stay[1] < matches.iloc[0]['minimum nights']:
            return False, f"The accommodation {stay[0]} do not obey the minumum nights rule."

    return True, None
425
+
426
def is_valid_visiting_city_number(question, tested_data):
    """
    Check that the plan visits the requested number of distinct cities.

    Cities are collected from each day's 'current_city' (both endpoints for a
    value containing 'from'); the origin question['org'] is not counted.

    Args:
    - question (dict): Query; reads 'days', 'org' and 'visiting_city_number'.
    - tested_data (list): Per-day plan dicts; reads 'current_city'.

    Returns:
    - tuple: (True, None) when the count matches, otherwise (False, reason).
    """
    visited = set()

    for day_index in range(min(question['days'], len(tested_data))):
        entry = tested_data[day_index]['current_city']

        if 'from' not in entry:
            visited.add(extract_before_parenthesis(entry))
            continue

        departure, arrival = extract_from_to(entry)
        departure = extract_before_parenthesis(departure)
        arrival = extract_before_parenthesis(arrival)
        if day_index == 0 and departure != question['org']:
            return False, f"The first day's city should be {question['org']}."

        visited.update((departure, arrival))

    # The origin is where the trip starts, not a destination.
    visited.discard(question['org'])

    if len(visited) == question['visiting_city_number']:
        return True, None
    return False, f"The number of visiting cities should be {question['visiting_city_number']}."
453
+
454
def is_valid_days(question, tested_data):
    """
    Check that the plan fills in exactly question['days'] days.

    A day counts as filled when its dict is non-empty and its 'current_city'
    is not the "no information needed" placeholder sentence.

    Args:
    - question (dict): Query; reads 'days'.
    - tested_data (list): Per-day plan dicts; reads 'current_city'.

    Returns:
    - tuple: (True, None) when the count matches, otherwise (False, reason).
    """
    placeholder = "You don't need to fill in the information for this or later days."
    examined = (tested_data[i] for i in range(min(question['days'], len(tested_data))))
    filled = sum(
        1
        for day in examined
        if day != {} and day['current_city'] != placeholder
    )

    if filled == question['days']:
        return True, None
    return False, f"The number of days should be {question['days']}."
465
+
466
def is_not_absent(question, tested_data):
    """Check that the plan is not missing required information.

    The plan must first pass ``is_valid_days`` and
    ``is_valid_visiting_city_number``. Each inspected day must then contain
    the six plan fields, and must not leave transportation, attraction,
    accommodation or meals empty (``''`` or ``'-'``) in the situations tested
    below. Finally, at least half of ``6 * question['days']`` values must be
    non-empty.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)``.
    """
    blank = ['', '-']
    # Checked in this order; the first missing key decides the message.
    required_fields = (
        ('transportation', "No Transportation Info."),
        ('breakfast', "No Breakfast Info."),
        ('lunch', "No Lunch Info."),
        ('dinner', "No Dinner Info."),
        ('attraction', "No Attraction Info."),
        ('accommodation', "No Accommodation Info."),
    )

    needed_info = 6 * question['days']
    total_valid_info = 0

    if not is_valid_days(question, tested_data)[0]:
        return False, "Invalid Days"

    if not is_valid_visiting_city_number(question, tested_data)[0]:
        return False, "Invalid City Number"

    for day_idx in range(min(question['days'], len(tested_data))):
        unit = tested_data[day_idx]

        for field, message in required_fields:
            if field not in unit:
                return False, message

        # Travel days must name a transportation.
        if ('from ' in unit['current_city'] or 'to ' in unit['current_city']) and unit['transportation'] in blank:
            return False, f"No transportation in day {day_idx+1} is not allowed."

        # Non-travel days must name an attraction.
        if ('from ' not in unit['current_city'] and ' to ' not in unit['current_city']) and unit['attraction'] in blank:
            return False, f"No attaction in day {day_idx+1} is not allowed."

        # Every day except the last requested one needs an accommodation.
        if day_idx != question['days'] - 1 and unit['accommodation'] in blank:
            return False, f"No accommodation in day {day_idx+1} is not allowed."

        # Meals may only be skipped on days whose city string contains 'from '.
        any_meal_blank = any(unit[meal] in blank for meal in ('breakfast', 'lunch', 'dinner'))
        if any_meal_blank and 'from ' not in unit['current_city']:
            return False, f"No meal in day {day_idx+1} is not allowed."

        # Count every non-empty value of the day, whatever its key.
        total_valid_info += sum(1 for value in unit.values() if value and value != '-')

    if total_valid_info * 1.0 / needed_info < 0.5:
        return False, "The absent information is more than 50%."

    return True, None
519
+
520
+
521
def evaluation(query_data, tested_data):
    """Run every commonsense check and return the results keyed by check name.

    Each value is whatever the corresponding check returns; the checks visible
    in this file return a ``(passed, message)`` pair. Checks run in the order
    listed, which is also the key order of the returned dict.
    """
    return {
        'is_reasonalbe_visiting_city': is_reasonalbe_visiting_city(query_data, tested_data),
        'is_valid_restaurants': is_valid_restaurants(query_data, tested_data),
        'is_valid_attractions': is_valid_attractions(query_data, tested_data),
        'is_valid_accommodation': is_valid_accommodaton(query_data, tested_data),
        'is_valid_transportation': is_valid_transportation(query_data, tested_data),
        'is_valid_information_in_current_city': is_valid_information_in_current_city(query_data, tested_data),
        'is_valid_information_in_sandbox': is_valid_information_in_sandbox(query_data, tested_data),
        'is_not_absent': is_not_absent(query_data, tested_data),
    }
532
+
533
def boolean_evaluation(query_data, tested_data):
    """Return ``True`` only if no commonsense check reports ``False``.

    All checks are run first. The results are then scanned in insertion order;
    the message of the first failing check is printed and ``False`` is
    returned. A check whose first element is ``None`` is not treated as a
    failure.
    """
    outcomes = {
        'is_reasonalbe_visiting_city': is_reasonalbe_visiting_city(query_data, tested_data),
        'is_valid_restaurants': is_valid_restaurants(query_data, tested_data),
        'is_valid_accommodation': is_valid_accommodaton(query_data, tested_data),
        'is_valid_attractions': is_valid_attractions(query_data, tested_data),
        'is_valid_transportation': is_valid_transportation(query_data, tested_data),
        'is_valid_information_in_current_city': is_valid_information_in_current_city(query_data, tested_data),
        'is_valid_information_in_sandbox': is_valid_information_in_sandbox(query_data, tested_data),
        'is_not_absent': is_not_absent(query_data, tested_data),
    }
    for outcome in outcomes.values():
        # Equality (not identity) so that non-bool falsy flags such as numpy bools are caught.
        if outcome[0] == False:
            print(outcome[1])
            return False
    return True
548
+
549
+ # if __name__ == '__main__':
550
+ # number_list = extract_numbers_from_filenames('/home/xj/toolAugEnv/code/toolConstraint/data/annotation/lrz')
551
+ # # json_data = json.load(open('/home/xj/toolAugEnv/code/toolConstraint/data/annotation/x/annotation_4.json'))
552
+ # query_data = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/query/lrz.jsonl')
553
+ # for idx in number_list:
554
+ # json_data = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/lrz/annotation_{idx}.json'))
555
+ # print(str(idx), evaluation(query_data[idx-1], json_data))
556
+ # # json_data = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/plan_{idx}.json'))
557
+ # # query_data = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/query/test.jsonl')[idx-1]
558
+ # # help me write all function name in this file, just the name
559
+ # #
560
+ # # list all function name in this file
561
+ # # ['is_reasonalbe_visiting_city', 'is_valiable_restaurants', 'is_valiable_attractions', 'is_valiable_transportation', 'is_valid_information_in_current_city', 'is_valid_information_in_sandbox']
562
+ # # print(is_valiable_restaurants(query_data, json_data))
563
+
564
+ # if __name__ == "__main__":
565
+ # user = 'zk'
566
+ # query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
567
+ # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
568
+ # commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
569
+ # for idx in idx_number_list:
570
+ # print(idx)
571
+ # query_data = query_data_list[idx-1]
572
+ # generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/plan_{idx}.json'))
573
+ # # generated_plan = generated_plan[:-1]
574
+ # if generated_plan[-1]['gpt-3.5-turbo-16k-result'] != 'Plan Fail':
575
+ # info_box = evaluation(query_data, generated_plan[-1]['gpt-3.5-turbo-16k-result'])
576
+ # generated_plan[-1]['toolAug-commonsense'] = info_box
577
+ # else:
578
+ # generated_plan[-1]['toolAug-commonsense'] = None
579
+ # info_box = None
580
+ # commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
581
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/plan_{idx}.json','w') as f:
582
+ # json.dump(generated_plan,f)
583
+
584
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/commonsense_statistic.json','w') as f:
585
+ # json.dump(commonsense_statistic,f)
586
+
587
+ # if __name__ == "__main__":
588
+ # user = 'all'
589
+ # model_type = ['chatgpt','gpt4','greedy_search'][2]
590
+ # query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
591
+ # # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
592
+ # idx_number_list = [i for i in range(1,501)]
593
+ # commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
594
+
595
+ # for idx in idx_number_list:
596
+ # print(idx)
597
+ # query_data = query_data_list[idx-1]
598
+ # generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/plan_{idx}.json'))
599
+ # # generated_plan = generated_plan[:-1]
600
+ # if model_type == 'greedy_search':
601
+ # info_box = evaluation(query_data, generated_plan[-1][f'greedy_search_plan'])
602
+ # else:
603
+ # info_box = evaluation(query_data, generated_plan[-1][f'{model_type}_human_collected_info_results_parsed'])
604
+ # generated_plan[-1][f'{model_type}_with_human_collected_commonsense'] = info_box
605
+ # commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
606
+
607
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/plan_{idx}.json','w') as f:
608
+ # json.dump(generated_plan,f)
609
+
610
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/{model_type}_with_human_collected_commonsense_statistic.json','w') as f:
611
+ # json.dump(commonsense_statistic,f)
612
+
613
+
614
+ # if __name__ == "__main__":
615
+ # user = 'all'
616
+ # query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
617
+ # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
618
+ # hardConstraint_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
619
+ # not_satified = []
620
+ # for idx in tqdm(idx_number_list):
621
+ # # print(idx)
622
+ # query_data = query_data_list[idx-1]
623
+ # generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}/annotation_{idx}.json'))
624
+
625
+ # if not boolean_evaluation(query_data, generated_plan):
626
+ # not_satified.append(idx)
627
+ # print(idx)
628
+ # generated_plan = generated_plan[:-1]
629
+ # print(not_satified)
630
+
631
if __name__ == "__main__":
    # Score the human-annotated plans of one split against the commonsense
    # checks, store the result next to each plan, and dump the aggregate.
    set_type = ["train", 'dev', 'test'][0]
    data_dir = f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{set_type}'
    results_dir = f'/home/xj/toolAugEnv/code/toolConstraint/results/{set_type}'

    query_data_list = load_line_json_data(f'{data_dir}/query/query.jsonl')
    commonsense_statistic = {level: {day: [] for day in [3, 5, 7]} for level in ['easy', 'medium', 'hard']}
    not_satified = []

    # Plan files are numbered from 1, queries are indexed from 0.
    for idx in tqdm(range(1, len(query_data_list) + 1)):
        query_data = query_data_list[idx - 1]

        # Context managers close every handle promptly instead of leaving it
        # to the garbage collector.
        with open(f'{data_dir}/plan/plan_{idx}.json') as f:
            generated_plan = json.load(f)
        try:
            with open(f'{results_dir}/plan_{idx}.json') as f:
                store_plan = json.load(f)
        except FileNotFoundError:
            # No stored result yet: start a fresh record list.
            store_plan = [{}]

        # The plan to evaluate is the second element of the annotation file.
        info_box = evaluation(query_data, generated_plan[1])
        store_plan[-1]['human_anno_commonsense_constraint'] = info_box
        with open(f'{results_dir}/plan_{idx}.json', 'w') as f:
            json.dump(store_plan, f)
        commonsense_statistic[query_data['level']][query_data['days']].append(info_box)

    print(not_satified)
    with open(f'{results_dir}/human_anno_commonsense_constraint.json', 'w') as f:
        json.dump(commonsense_statistic, f)
658
+
659
+ # if __name__ == "__main__":
660
+ # user = 'all'
661
+ # model_type = ['chatgpt','gpt4'][1]
662
+ # query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
663
+ # # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
664
+ # idx_number_list = [i for i in range(1,501)]
665
+ # commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
666
+ # cnt = 0
667
+ # for idx in idx_number_list:
668
+ # # print(idx)
669
+ # query_data = query_data_list[idx-1]
670
+ # generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre/{user}/plan_{idx}.json'))[-1]['gpt4_human_collected_info_results_parsed']
671
+ # # generated_plan = generated_plan[:-1]
672
+
673
+ # if not boolean_evaluation(query_data, generated_plan):
674
+ # cnt += 1
675
+ # print(idx)
676
+ # print(cnt)
677
+
678
+ # if __name__ == "__main__":
679
+ # parser = argparse.ArgumentParser(description="")
680
+ # # model_type = ['gpt-3.5-turbo-1106','gpt-4-1106-preview','greedy_search','mistral-7B-32K','gemini2','mixtral','gpt-3.5-turbo-11062'][-1]
681
+ # # method = ['direct','cot','react','reflexion','tool-use'][-1]
682
+ # # set_type = ['dev','test'][0]
683
+ # parser.add_argument("--model_type", type=str, default="gpt-3.5-turbo-1106")
684
+ # parser.add_argument("--method", type=str, default="direct")
685
+ # parser.add_argument("--set_type", type=str, default="dev")
686
+ # args = parser.parse_args()
687
+ # directory = f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{args.set_type}'
688
+ # query_data_list = load_line_json_data(os.path.join(directory, 'query/query.jsonl'))
689
+ # # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
690
+ # idx_number_list = [i for i in range(1,len(query_data_list)+1)]
691
+ # commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
692
+ # deliver_cnt = 0
693
+ # if args.method == 'tool-use':
694
+ # suffix = ''
695
+ # else:
696
+ # suffix = '_with_human_info'
697
+ # for idx in tqdm(idx_number_list):
698
+ # # print(idx)
699
+ # query_data = query_data_list[idx-1]
700
+ # generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/plan_{idx}.json'))
701
+ # # generated_plan = generated_plan[:-1]
702
+ # if args.model_type == 'greedy_search':
703
+ # info_box = evaluation(query_data, generated_plan[-1][f'greedy_search_plan'])
704
+ # else:
705
+ # if args.method == 'tool-use':
706
+ # suffix2 = ''
707
+ # else:
708
+ # suffix2 = '_collected'
709
+ # if generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] and generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results']!='Max Token Length Exceeded.':
710
+ # try:
711
+ # info_box = evaluation(query_data, generated_plan[-1][f'{args.model_type}_{args.method}{suffix}_results_parsed'])
712
+ # except KeyError:
713
+ # info_box = None
714
+ # generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] = ""
715
+ # except IndexError:
716
+ # info_box = None
717
+ # generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] = ""
718
+ # else:
719
+ # info_box = None
720
+ # if info_box:
721
+ # deliver_cnt += 1
722
+ # generated_plan[-1][f'{args.model_type}_{args.method}{suffix}_commonsense_constraint'] = info_box
723
+ # commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
724
+
725
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/plan_{idx}.json','w') as f:
726
+ # json.dump(generated_plan,f)
727
+
728
+ # with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/{args.model_type}_{args.method}{suffix}_commonsense_constraint.json','w') as f:
729
+ # json.dump(commonsense_statistic,f)
730
+
731
+ # if args.set_type == 'dev':
732
+ # print(f"Model:{args.model_type} Method:{args.method} Set: {args.set_type} \nDeliver Rate: {deliver_cnt/180}" )
733
+ # elif args.set_type == 'test':
734
+ # print(f"Model:{args.model_type} Method:{args.method} Set: {args.set_type} \nDeliver Rate: {deliver_cnt/1000}" )
735
+
evaluation/eval.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from commonsenseConstraint import evaluation as commonsense_eval
2
+ from hardConstraint import evaluation as hard_eval
3
+ import json
4
+ from tqdm import tqdm
5
+ from datasets import load_dataset
6
+
7
+
8
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    The file is read line by line as UTF-8. Blank or whitespace-only lines are
    skipped, so an empty file yields ``[]`` and stray blank lines do not raise.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream the file instead of loading it whole.
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data
15
+
16
def count_true_false(data):
    """Return ``(true_count, false_count)`` for the sequence *data*.

    Counting uses ``data.count``, i.e. equality with ``True`` / ``False``;
    elements equal to neither (for example ``None`` or a message string) are
    ignored.
    """
    return data.count(True), data.count(False)
21
+
22
def statistics(commonsense_statistic):
    """Aggregate per-plan check results into true/false totals.

    Args:
        commonsense_statistic: Mapping ``level -> day -> list`` where each list
            item is either falsy (no result for that plan) or a dict mapping a
            check name to a sequence passed to ``count_true_false``.

    Returns:
        Mapping ``level -> day -> check name -> {"true": int, "false": int}``.
        Every level/day pair of the input is present, even when empty.
    """
    summary = {
        level: {day: {} for day in commonsense_statistic[level]}
        for level in commonsense_statistic
    }

    for level, per_day in commonsense_statistic.items():
        for day, plan_results in per_day.items():
            bucket = summary[level][day]
            for plan_result in plan_results:
                # Plans without a result contribute nothing.
                if not plan_result:
                    continue
                for check_name, outcome in plan_result.items():
                    true_count, false_count = count_true_false(outcome)
                    tally = bucket.setdefault(check_name, {"true": 0, "false": 0})
                    tally["true"] += true_count
                    tally["false"] += false_count

    return summary
38
+
39
+
40
def eval_score(validation_or_test: str, file_path: str, TOKEN):
    """Score a JSONL file of generated plans against the benchmark queries.

    Args:
        validation_or_test: Split to score, ``'validation'`` or ``'test'``.
        file_path: Path of a JSON Lines file with one record per query, in
            query order. Each record must have a ``'plan'`` entry.
        TOKEN: Access token forwarded to ``load_dataset``.

    Returns:
        A dict with the delivery rate, the micro and macro pass rates of the
        commonsense and hard constraints, and the final pass rate. The
        denominators are fixed constants chosen per split.

    Raises:
        ValueError: If ``validation_or_test`` is not a supported split.
    """
    # Fail early with a clear message instead of an UnboundLocalError later on.
    if validation_or_test not in ('validation', 'test'):
        raise ValueError(
            f"validation_or_test must be 'validation' or 'test', got {validation_or_test!r}."
        )

    query_data_list = list(
        load_dataset('osunlp/TravelBenchEval', validation_or_test, token=TOKEN)[validation_or_test]
    )
    hardConstraint_statistic = {level: {day: [] for day in [3, 5, 7]} for level in ['easy', 'medium', 'hard']}
    commonsenseConstraint_statistic = {level: {day: [] for day in [3, 5, 7]} for level in ['easy', 'medium', 'hard']}
    tested_plans = load_line_json_data(file_path)
    delivery_cnt = 0
    plan_constraint_store = []

    for idx in tqdm(range(len(query_data_list))):
        query_data = query_data_list[idx]
        tested_plan = tested_plans[idx]
        # NOTE(review): eval() executes arbitrary expressions. These strings
        # come from the dataset and from the submitted file; consider
        # ast.literal_eval once it is confirmed that they are plain literals.
        if isinstance(query_data, str):
            query_data = eval(query_data)
        if isinstance(tested_plan, str):
            tested_plan = eval(tested_plan)
        # Decoded in place: the second pass over query_data_list below reads
        # 'local_constraint' as a mapping.
        if isinstance(query_data['local_constraint'], str):
            query_data['local_constraint'] = eval(query_data['local_constraint'])

        if tested_plan['plan']:
            delivery_cnt += 1
            commonsense_info_box = commonsense_eval(query_data, tested_plan['plan'])
        else:
            commonsense_info_box = None

        # Hard constraints are only evaluated for plans that are complete and
        # stay inside the sandbox.
        if commonsense_info_box and commonsense_info_box['is_not_absent'][0] and commonsense_info_box['is_valid_information_in_sandbox'][0]:
            hard_info_box = hard_eval(query_data, tested_plan['plan'])
        else:
            hard_info_box = None

        plan_constraint_store.append({'commonsense_constraint': commonsense_info_box, 'hard_constraint': hard_info_box})

        commonsenseConstraint_statistic[query_data['level']][query_data['days']].append(commonsense_info_box)
        hardConstraint_statistic[query_data['level']][query_data['days']].append(hard_info_box)

    commonsenseConstraint_statistic_processed = statistics(commonsenseConstraint_statistic)
    hardConstraint_statistic_processed = statistics(hardConstraint_statistic)

    # Per level/day bookkeeping. Only the 'pass' counters feed the returned
    # result; the remaining tables are kept for inspection while debugging.
    constraint_record = {key: {day: {'house rule': 0, 'cuisine': 0, 'room type': 0, 'transportation': 0} for day in [3, 5, 7]} for key in ['medium', 'hard']}
    constraint_mapping = {'house rule': 'valid_room_rule', 'cuisine': 'valid_cuisine', 'room type': 'valid_room_type', 'transportation': 'valid_transportation'}
    mapping_constraint_record = {key: {day: {'valid_room_rule': 0, 'valid_cuisine': 0, 'valid_room_type': 0, 'valid_transportation': 0} for day in [3, 5, 7]} for key in ['medium', 'hard']}
    count_record = {key: {day: 0 for day in [3, 5, 7]} for key in ['easy', 'medium', 'hard']}

    for unit in query_data_list:
        count_record[unit['level']][unit['days']] += 1
        for key in constraint_record['medium'][3]:
            if unit['local_constraint'][key] is not None:
                constraint_record[unit['level']][unit['days']][key] += 1
                mapping_constraint_record[unit['level']][unit['days']][constraint_mapping[key]] += 1

    data_record = {key: {day: [] for day in [3, 5, 7]} for key in ['easy', 'medium', 'hard']}

    constraint_dis_record = {"commonsense": {"pass": 0, "total": 0}, "hard": {"pass": 0, "total": 0}}

    key_dict = {'commonsense': ['is_valid_information_in_current_city', 'is_valid_information_in_sandbox', 'is_reasonalbe_visiting_city', 'is_valid_restaurants', 'is_valid_transportation', 'is_valid_attractions', 'is_valid_accommodation', 'is_not_absent'], 'hard': ['valid_cost', 'valid_room_rule', 'valid_cuisine', 'valid_room_type', 'valid_transportation']}

    for constraint in ['commonsense', 'hard']:
        if constraint == 'commonsense':
            constraint_statistic = commonsenseConstraint_statistic_processed
        else:
            constraint_statistic = hardConstraint_statistic_processed

        for level in constraint_statistic:
            for day in constraint_statistic[level]:
                for check_name in key_dict[constraint]:
                    data_record[level][day].append('0/0')
                    if check_name not in constraint_statistic[level][day]:
                        continue
                    passed = constraint_statistic[level][day][check_name]['true']
                    constraint_dis_record[constraint]['pass'] += passed
                    if constraint == 'hard':
                        if level == 'hard' and check_name in ['valid_room_rule', 'valid_cuisine', 'valid_room_type', 'valid_transportation']:
                            data_record[level][day][-1] = f"{passed}/{mapping_constraint_record[level][day][check_name]}"
                            constraint_dis_record[constraint]['total'] += mapping_constraint_record[level][day][check_name]
                        elif level == 'medium' and check_name in ['valid_room_rule', 'valid_cuisine', 'valid_room_type']:
                            data_record[level][day][-1] = f"{passed}/{mapping_constraint_record[level][day][check_name]}"
                            constraint_dis_record[constraint]['total'] += mapping_constraint_record[level][day][check_name]
                        else:
                            data_record[level][day][-1] = f"{passed}/{count_record[level][day]}"
                            if check_name in ['valid_cost', 'valid_visitng_city_number', 'valid_days']:
                                constraint_dis_record[constraint]['total'] += count_record[level][day]
                    else:
                        data_record[level][day][-1] = f"{passed}/{count_record[level][day]}"
                        constraint_dis_record[constraint]['total'] += count_record[level][day]

    final_all_cnt = 0
    final_commonsense_cnt = 0
    final_hardConstraint_cnt = 0
    final_all_cnt_map = {level: 0 for level in ['easy', 'medium', 'hard']}
    for idx in range(len(query_data_list)):
        commonsense_result = plan_constraint_store[idx]['commonsense_constraint']
        hard_result = plan_constraint_store[idx]['hard_constraint']
        if not commonsense_result:
            continue

        final_commonsense_pass = True
        final_hardConstraint_pass = True
        # A check whose flag is None counts as not applicable, not as failed.
        for item in commonsense_result:
            if commonsense_result[item][0] is not None and not commonsense_result[item][0]:
                final_commonsense_pass = False
                break
        # Plans without hard-constraint results are not counted in any macro total.
        if hard_result is None:
            continue
        for item in hard_result:
            if hard_result[item][0] is not None and hard_result[item][0] == False:
                final_hardConstraint_pass = False
                break

        if final_commonsense_pass:
            final_commonsense_cnt += 1
        if final_hardConstraint_pass:
            final_hardConstraint_cnt += 1
        if final_commonsense_pass and final_hardConstraint_pass:
            final_all_cnt += 1
            final_all_cnt_map[query_data_list[idx]['level']] += 1

    # Fixed denominators per split: plans, commonsense checks, hard checks.
    plan_total, commonsense_total, hard_total = {
        'validation': (180, 1440, 420),
        'test': (1000, 8000, 2290),
    }[validation_or_test]

    result = {}
    result['Delivery Rate'] = delivery_cnt / plan_total
    result['Commonsense Constraint Micro Pass Rate'] = constraint_dis_record['commonsense']['pass'] / commonsense_total
    result['Commonsense Constraint Macro Pass Rate'] = final_commonsense_cnt / plan_total
    result['Hard Constraint Micro Pass Rate'] = constraint_dis_record['hard']['pass'] / hard_total
    result['Hard Constraint Macro Pass Rate'] = final_hardConstraint_cnt / plan_total
    result['Final Pass Rate'] = final_all_cnt / plan_total

    return result
181
+
evaluation/hardConstraint.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from annotation.src.utils import get_valid_name_city,extract_before_parenthesis,extract_numbers_from_filenames
2
+ from tools.flights.apis import Flights
3
+ from tools.accommodations.apis import Accommodations
4
+ from tools.restaurants.apis import Restaurants
5
+ from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
6
+ from tools.attractions.apis import Attractions
7
+ import math
8
+ import json
9
+ import re
10
+ import numpy as np
11
+ import os
12
+ import sys
13
+ from tqdm import tqdm
14
+ import argparse
15
+
16
# Append the parent of the current working directory to the import path, then
# make this file's directory the working directory.
# NOTE(review): both statements run after the project imports at the top of the
# file, so they cannot influence how those imports are resolved - confirm intent.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))


# Tool instances created once at import time and shared by the checks below:
# flight, accommodation and restaurants are queried through their `.data`
# attribute, googleDistanceMatrix through `run_for_evaluation`.
flight = Flights()
accommodation = Accommodations()
restaurants = Restaurants()
googleDistanceMatrix = GoogleDistanceMatrix()
attractions = Attractions()
25
+
26
+
27
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    The file is read line by line as UTF-8. Blank or whitespace-only lines are
    skipped, so an empty file yields ``[]`` and stray blank lines do not raise.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream the file instead of loading it whole.
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data
34
+
35
+
36
def convert_bool_values(item):
    """Recursively replace numpy booleans with built-in ``bool`` values.

    Dicts, lists and tuples are rebuilt with their contents converted (dict
    keys are kept as they are); a ``np.bool_`` becomes a ``bool``; every other
    value is returned unchanged.
    """
    if isinstance(item, np.bool_):
        # numpy's bool_ is not JSON serialisable; a plain bool is.
        return bool(item)
    if isinstance(item, dict):
        return {key: convert_bool_values(value) for key, value in item.items()}
    if isinstance(item, list):
        return [convert_bool_values(element) for element in item]
    if isinstance(item, tuple):
        return tuple(convert_bool_values(element) for element in item)
    return item
52
+
53
+
54
+
55
+
56
def extract_from_to(text: str):
    """
    Pull the origin and destination out of a "from A to B" phrase.

    The destination stops at the first comma or at the end of the string.

    Args:
    - text (str): The string to search.

    Returns:
    - tuple: ``(A, B)`` for the first match, or ``(None, None)`` when the text
      contains no such phrase.
    """
    found = re.search(r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)", text)
    if not found:
        return (None, None)
    return found.groups()
69
+
70
+
71
def get_total_cost(question, tested_data):
    """Add up the cost of transportation, meals and accommodation in a plan.

    Only the first ``min(question['days'], len(tested_data))`` days are
    considered. Prices are looked up in the module-level tool tables; items
    that cannot be found there contribute nothing.

    Returns:
        The accumulated cost for ``question['people_number']`` travellers.
    """
    total_cost = 0
    for day_idx in range(min(question['days'], len(tested_data))):
        unit = tested_data[day_idx]

        # Transportation: route taken from the transportation text, falling
        # back to the day's current_city when it has no "from A to B" phrase.
        if unit['transportation'] and unit['transportation'] != '-':
            transport = unit['transportation']
            org_city, dest_city = extract_from_to(transport)
            if org_city is None or dest_city is None:
                org_city, dest_city = extract_from_to(unit['current_city'])

            if org_city is not None and dest_city is not None:
                if 'flight number' in transport.lower():
                    # Flights are priced per person.
                    flight_no = transport.split('Flight Number: ')[1].split(',')[0]
                    flights_found = flight.data[flight.data['Flight Number'] == flight_no]
                    if len(flights_found) > 0:
                        total_cost += flights_found['Price'].values[0] * question['people_number']
                elif 'self-driving' in transport.lower():
                    # One car per five travellers.
                    cost = googleDistanceMatrix.run_for_evaluation(org_city, dest_city, 'self-driving')['cost']
                    total_cost += cost * math.ceil(question['people_number'] * 1.0 / 5)
                elif 'taxi' in transport.lower():
                    # One taxi per four travellers.
                    cost = googleDistanceMatrix.run_for_evaluation(org_city, dest_city, 'taxi')['cost']
                    total_cost += cost * math.ceil(question['people_number'] * 1.0 / 4)

        # Meals: average cost of the first matching restaurant, per person.
        for meal in ('breakfast', 'lunch', 'dinner'):
            if unit[meal] and unit[meal] != '-':
                name, city = get_valid_name_city(unit[meal])
                eateries = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
                if len(eateries) > 0:
                    total_cost += eateries['Average Cost'].values[0] * question['people_number']

        # Accommodation: enough rooms to cover the party at maximum occupancy.
        if unit['accommodation'] and unit['accommodation'] != '-':
            name, city = get_valid_name_city(unit['accommodation'])
            stays = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
            if len(stays) > 0:
                rooms = math.ceil(question['people_number'] * 1.0 / stays['maximum occupancy'].values[0])
                total_cost += stays['price'].values[0] * rooms

    return total_cost
129
+
130
+
131
def is_valid_room_rule(question, tested_data):
    """Check that no chosen accommodation forbids the requested house rule.

    Args:
        question: Query dict; reads ``question['local_constraint']['house rule']``
            and ``question['days']``.
        tested_data: List of per-day plan dicts; reads ``'accommodation'``.

    Returns:
        ``(None, None)`` when the query has no house-rule constraint,
        ``(False, message)`` when a matched accommodation's ``house_rules``
        text forbids the requested rule, otherwise ``(True, None)``.
    """
    house_rule = question['local_constraint']['house rule']
    if house_rule is None:
        return None, None

    # Requested rule -> phrase in the accommodation's house rules that forbids it.
    # NOTE(review): the original only matched the misspelling 'parities', so a
    # 'parties' constraint was never enforced. Both spellings are accepted here;
    # confirm which one the query data actually uses.
    forbidden_phrases = {
        'smoking': 'No smoking',
        'parties': 'No parties',
        'parities': 'No parties',
        'children under 10': 'No children under 10',
        'visitors': 'No visitors',
        'pets': 'No pets',
    }
    forbidden = forbidden_phrases.get(house_rule)

    for i in range(min(question['days'], len(tested_data))):
        unit = tested_data[i]
        if unit['accommodation'] and unit['accommodation'] != '-':
            name, city = get_valid_name_city(unit['accommodation'])
            res = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
            # Only the first matching row is inspected; unknown rule names never fail.
            if len(res) > 0 and forbidden is not None and forbidden in str(res['house_rules'].values[0]):
                return False, f"The house rule should be {house_rule}."

    return True, None
155
+
156
+
157
+
158
def is_valid_cuisine(question, tested_data):
    """Check that every required cuisine is covered by some planned meal.

    Args:
        question: Query dict; reads ``question['local_constraint']['cuisine']``
            (a collection of cuisine names, or a falsy value when there is no
            constraint), ``question['days']`` and ``question['org']``.
        tested_data: Per-day plan units; each unit's ``'breakfast'``,
            ``'lunch'`` and ``'dinner'`` entries are read.

    Returns:
        ``(None, None)`` when the query has no cuisine constraint,
        ``(True, None)`` when every required cuisine appears in the
        ``Cuisines`` text of at least one matched restaurant, otherwise
        ``(False, message)`` naming the first missing cuisine.
    """
    required_cuisines = question['local_constraint']['cuisine']
    if not required_cuisines:
        return None, None

    cuisine_set = set()
    # Only the first question['days'] units of the plan are inspected.
    for i in range(min(question['days'], len(tested_data))):
        unit = tested_data[i]
        for meal in ('breakfast', 'lunch', 'dinner'):
            if not unit[meal] or unit[meal] == '-':
                continue
            name, city = get_valid_name_city(unit[meal])
            if city == question['org']:
                # Meals eaten in the origin city do not count. Skip only this
                # meal: the previous code skipped the remainder of the day,
                # which dropped later meals eaten at the destination.
                continue
            res = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
            if len(res) == 0:
                continue
            # str() guards against a non-string (e.g. missing) Cuisines cell.
            served = str(res.iloc[0]['Cuisines'])
            for cuisine in required_cuisines:
                if cuisine in served:
                    cuisine_set.add(cuisine)

    # Report the first unsatisfied cuisine, in the order the query lists them.
    for cuisine in required_cuisines:
        if cuisine not in cuisine_set:
            return False, f"The cuisine {cuisine} is not satisfied."
    return True, None
204
+
205
+
206
def is_valid_transportation(question, tested_data):
    """Check that the plan avoids the transportation mode the query forbids.

    Args:
        question: Query dict; reads
            ``question['local_constraint']['transportation']`` (``'no flight'``,
            ``'no self-driving'`` or ``None``) and ``question['days']``.
        tested_data: Per-day plan units; each unit's ``'transportation'``
            entry is read.

    Returns:
        ``(None, None)`` when the query has no transportation constraint,
        ``(False, message)`` for the first day whose transportation text
        mentions the forbidden mode, otherwise ``(True, None)``.
    """
    constraint = question['local_constraint']['transportation']
    if constraint is None:
        return None, None

    # Only the first question['days'] units of the plan are inspected.
    for i in range(min(question['days'], len(tested_data))):
        value = tested_data[i]['transportation']
        if not value or value == '-':
            continue
        uses_forbidden_mode = (
            (constraint == 'no flight' and 'Flight' in value)
            or (constraint == 'no self-driving' and 'Self-driving' in value)
        )
        if uses_forbidden_mode:
            # The constraint text is already a negation ("no flight"), so the
            # message says "should be", not the former "should not be".
            return False, f"The transportation should be {constraint}."

    return True, None
219
+
220
+
221
def is_valid_room_type(question, tested_data):
    """Check that every booked accommodation has the requested room type.

    Reads ``question['local_constraint']['room type']`` (one of
    "shared room", "not shared room", "private room", "entire room", or
    ``None``) and ``question['days']``; reads each plan unit's
    ``'accommodation'`` entry.

    Returns ``(None, None)`` when no room type is requested,
    ``(False, message)`` for the first mismatching accommodation, otherwise
    ``(True, None)``. Accommodations not found in the database are ignored.
    """
    if question['local_constraint']['room type'] is None:
        return None,None

    # Database label each positive constraint must equal.
    exact_labels = (
        ('shared room', 'Shared room'),
        ('private room', 'Private room'),
        ('entire room', 'Entire home/apt'),
    )

    day_count = min(question['days'],len(tested_data))
    for day_index in range(day_count):
        plan_unit = tested_data[day_index]
        if not (plan_unit['accommodation'] and plan_unit['accommodation'] != '-'):
            continue
        name, city = get_valid_name_city(plan_unit['accommodation'])
        name_matches = accommodation.data['NAME'].astype(str).str.contains(re.escape(name))
        city_matches = accommodation.data['city'] == city
        res = accommodation.data[(name_matches) & (city_matches)]
        if len(res) == 0:
            continue

        wanted = question['local_constraint']['room type']
        # Only the first matching listing is consulted.
        actual = res['room type'].values[0]
        if wanted == 'not shared room':
            mismatch = actual == 'Shared room'
        else:
            mismatch = False
            for constraint_name, required_label in exact_labels:
                if wanted == constraint_name:
                    mismatch = actual != required_label
                    break
        if mismatch:
            return False, f"The room type should be {question['local_constraint']['room type']}."

    return True, None
243
+
244
+
245
def evaluation(query_data, tested_data):
    """Run every hard-constraint check and collect the per-check results.

    Returns a dict mapping each check name to the ``(verdict, message)``
    tuple produced by that check; ``'valid_cost'`` holds whether the plan's
    total cost is within ``query_data['budget']``, with a ``None`` message.
    """
    # Dict entries are evaluated top to bottom, so the checks run in the
    # same order as their keys appear here.
    return {
        'valid_cuisine': is_valid_cuisine(query_data, tested_data),
        'valid_room_rule': is_valid_room_rule(query_data, tested_data),
        'valid_transportation': is_valid_transportation(query_data, tested_data),
        'valid_room_type': is_valid_room_type(query_data, tested_data),
        'valid_cost': (bool(get_total_cost(query_data, tested_data) <= query_data['budget']), None),
    }
253
+
254
def boolean_evaluation(query_data, tested_data):
    """Return whether the plan passes all hard-constraint checks.

    Runs the same five checks as the per-check report, prints the name of
    the first check whose verdict equals ``False`` and returns ``False``;
    returns ``True`` when no check fails. A verdict of ``None`` (constraint
    not requested) does not count as a failure.
    """
    outcomes = {
        'valid_cuisine': is_valid_cuisine(query_data, tested_data),
        'valid_room_rule': is_valid_room_rule(query_data, tested_data),
        'valid_transportation': is_valid_transportation(query_data, tested_data),
        'valid_room_type': is_valid_room_type(query_data, tested_data),
        'valid_cost': (bool(get_total_cost(query_data, tested_data) <= query_data['budget']), None),
    }
    for check_name, outcome in outcomes.items():
        verdict = outcome[0]
        # Equality (not truthiness) so that a None verdict is not a failure.
        if verdict == False:
            print(check_name)
            return False
    return True
266
+
evaluation/scored/1_validation_two-stage_1.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"Delivery Rate": 0.8944444444444445, "Commonsense Constraint Micro Pass Rate": 0.6111111111111112, "Commonsense Constraint Macro Pass Rate": 0.027777777777777776, "Hard Constraint Micro Pass Rate": 0.1523809523809524, "Hard Constraint Macro Pass Rate": 0.10555555555555556, "Final Pass Rate": 0.005555555555555556}
evaluation/scored/textbox_validation_two-stage_1.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"Delivery Rate": 0.8944444444444445, "Commonsense Constraint Micro Pass Rate": 0.6111111111111112, "Commonsense Constraint Macro Pass Rate": 0.027777777777777776, "Hard Constraint Micro Pass Rate": 0.1523809523809524, "Hard Constraint Macro Pass Rate": 0.10555555555555556, "Final Pass Rate": 0.005555555555555556}
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ datasets==2.16.1
2
+ gradio==3.50.2
3
+ huggingface-hub==0.20.2
tools/__init__.py ADDED
File without changes
tools/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (170 Bytes). View file
 
tools/accommodations/.ipynb_checkpoints/test-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
tools/accommodations/__init__.py ADDED
File without changes
tools/accommodations/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (185 Bytes). View file
 
tools/accommodations/__pycache__/apis.cpython-39.pyc ADDED
Binary file (1.57 kB). View file
 
tools/accommodations/apis.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pandas import DataFrame
3
+ from typing import Optional
4
+ from annotation.src.utils import extract_before_parenthesis
5
+
6
+
7
class Accommodations:
    """Lookup tool over the accommodations CSV database."""

    def __init__(self, path="../database/accommodations/clean_accommodations_2022.csv"):
        """Load the accommodations table from ``path``.

        Rows with a missing value in any CSV column are dropped first, then
        only the columns used by the tool are kept.
        """
        self.path = path
        self.data = pd.read_csv(self.path).dropna()[['NAME','price','room type', 'house_rules', 'minimum nights', 'maximum occupancy', 'review rate number', 'city']]
        print("Accommodations loaded.")

    def load_db(self):
        """Reload the table from ``self.path``, dropping rows with missing values.

        NOTE(review): unlike ``__init__``, this keeps every CSV column rather
        than the selected subset -- confirm whether that is intended.
        """
        self.data = pd.read_csv(self.path).dropna()

    def run(self,
            city: str,
            ) -> DataFrame:
        """Search for accommodations by city.

        Returns the rows whose ``city`` column equals ``city``. When no row
        matches, a human-readable message string is returned instead of a
        DataFrame.
        """
        results = self.data[self.data["city"] == city]
        if len(results) == 0:
            # Previously said "attraction", a copy-paste slip from another tool.
            return "There is no accommodation in this city."

        return results

    def run_for_annotation(self,
                           city: str,
                           ) -> DataFrame:
        """Search for accommodations by city, for the annotation workflow.

        ``city`` is first passed through ``extract_before_parenthesis``; the
        rows whose ``city`` column equals the result are returned (possibly
        an empty DataFrame).
        """
        results = self.data[self.data["city"] == extract_before_parenthesis(city)]
        return results
tools/accommodations/test.ipynb ADDED
@@ -0,0 +1,2037 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "ad7592e7",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "/tmp/ipykernel_2459435/230780042.py:2: DtypeWarning: Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
14
+ " data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/Airbnb_Open_Data.csv')\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "import pandas as pd\n",
20
+ "data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/Airbnb_Open_Data.csv')"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "id": "f97916a9",
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>id</th>\n",
51
+ " <th>NAME</th>\n",
52
+ " <th>host id</th>\n",
53
+ " <th>host_identity_verified</th>\n",
54
+ " <th>host name</th>\n",
55
+ " <th>neighbourhood group</th>\n",
56
+ " <th>neighbourhood</th>\n",
57
+ " <th>lat</th>\n",
58
+ " <th>long</th>\n",
59
+ " <th>country</th>\n",
60
+ " <th>...</th>\n",
61
+ " <th>service fee</th>\n",
62
+ " <th>minimum nights</th>\n",
63
+ " <th>number of reviews</th>\n",
64
+ " <th>last review</th>\n",
65
+ " <th>reviews per month</th>\n",
66
+ " <th>review rate number</th>\n",
67
+ " <th>calculated host listings count</th>\n",
68
+ " <th>availability 365</th>\n",
69
+ " <th>house_rules</th>\n",
70
+ " <th>license</th>\n",
71
+ " </tr>\n",
72
+ " </thead>\n",
73
+ " <tbody>\n",
74
+ " <tr>\n",
75
+ " <th>0</th>\n",
76
+ " <td>1001254</td>\n",
77
+ " <td>Clean &amp; quiet apt home by the park</td>\n",
78
+ " <td>80014485718</td>\n",
79
+ " <td>unconfirmed</td>\n",
80
+ " <td>Madaline</td>\n",
81
+ " <td>Brooklyn</td>\n",
82
+ " <td>Kensington</td>\n",
83
+ " <td>40.64749</td>\n",
84
+ " <td>-73.97237</td>\n",
85
+ " <td>United States</td>\n",
86
+ " <td>...</td>\n",
87
+ " <td>$193</td>\n",
88
+ " <td>10.0</td>\n",
89
+ " <td>9.0</td>\n",
90
+ " <td>10/19/2021</td>\n",
91
+ " <td>0.21</td>\n",
92
+ " <td>4.0</td>\n",
93
+ " <td>6.0</td>\n",
94
+ " <td>286.0</td>\n",
95
+ " <td>Clean up and treat the home the way you'd like...</td>\n",
96
+ " <td>NaN</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>1</th>\n",
100
+ " <td>1002102</td>\n",
101
+ " <td>Skylit Midtown Castle</td>\n",
102
+ " <td>52335172823</td>\n",
103
+ " <td>verified</td>\n",
104
+ " <td>Jenna</td>\n",
105
+ " <td>Manhattan</td>\n",
106
+ " <td>Midtown</td>\n",
107
+ " <td>40.75362</td>\n",
108
+ " <td>-73.98377</td>\n",
109
+ " <td>United States</td>\n",
110
+ " <td>...</td>\n",
111
+ " <td>$28</td>\n",
112
+ " <td>30.0</td>\n",
113
+ " <td>45.0</td>\n",
114
+ " <td>5/21/2022</td>\n",
115
+ " <td>0.38</td>\n",
116
+ " <td>4.0</td>\n",
117
+ " <td>2.0</td>\n",
118
+ " <td>228.0</td>\n",
119
+ " <td>Pet friendly but please confirm with me if the...</td>\n",
120
+ " <td>NaN</td>\n",
121
+ " </tr>\n",
122
+ " <tr>\n",
123
+ " <th>2</th>\n",
124
+ " <td>1002403</td>\n",
125
+ " <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
126
+ " <td>78829239556</td>\n",
127
+ " <td>NaN</td>\n",
128
+ " <td>Elise</td>\n",
129
+ " <td>Manhattan</td>\n",
130
+ " <td>Harlem</td>\n",
131
+ " <td>40.80902</td>\n",
132
+ " <td>-73.94190</td>\n",
133
+ " <td>United States</td>\n",
134
+ " <td>...</td>\n",
135
+ " <td>$124</td>\n",
136
+ " <td>3.0</td>\n",
137
+ " <td>0.0</td>\n",
138
+ " <td>NaN</td>\n",
139
+ " <td>NaN</td>\n",
140
+ " <td>5.0</td>\n",
141
+ " <td>1.0</td>\n",
142
+ " <td>352.0</td>\n",
143
+ " <td>I encourage you to use my kitchen, cooking and...</td>\n",
144
+ " <td>NaN</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>3</th>\n",
148
+ " <td>1002755</td>\n",
149
+ " <td>NaN</td>\n",
150
+ " <td>85098326012</td>\n",
151
+ " <td>unconfirmed</td>\n",
152
+ " <td>Garry</td>\n",
153
+ " <td>Brooklyn</td>\n",
154
+ " <td>Clinton Hill</td>\n",
155
+ " <td>40.68514</td>\n",
156
+ " <td>-73.95976</td>\n",
157
+ " <td>United States</td>\n",
158
+ " <td>...</td>\n",
159
+ " <td>$74</td>\n",
160
+ " <td>30.0</td>\n",
161
+ " <td>270.0</td>\n",
162
+ " <td>7/5/2019</td>\n",
163
+ " <td>4.64</td>\n",
164
+ " <td>4.0</td>\n",
165
+ " <td>1.0</td>\n",
166
+ " <td>322.0</td>\n",
167
+ " <td>NaN</td>\n",
168
+ " <td>NaN</td>\n",
169
+ " </tr>\n",
170
+ " <tr>\n",
171
+ " <th>4</th>\n",
172
+ " <td>1003689</td>\n",
173
+ " <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
174
+ " <td>92037596077</td>\n",
175
+ " <td>verified</td>\n",
176
+ " <td>Lyndon</td>\n",
177
+ " <td>Manhattan</td>\n",
178
+ " <td>East Harlem</td>\n",
179
+ " <td>40.79851</td>\n",
180
+ " <td>-73.94399</td>\n",
181
+ " <td>United States</td>\n",
182
+ " <td>...</td>\n",
183
+ " <td>$41</td>\n",
184
+ " <td>10.0</td>\n",
185
+ " <td>9.0</td>\n",
186
+ " <td>11/19/2018</td>\n",
187
+ " <td>0.10</td>\n",
188
+ " <td>3.0</td>\n",
189
+ " <td>1.0</td>\n",
190
+ " <td>289.0</td>\n",
191
+ " <td>Please no smoking in the house, porch or on th...</td>\n",
192
+ " <td>NaN</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>...</th>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " <td>...</td>\n",
206
+ " <td>...</td>\n",
207
+ " <td>...</td>\n",
208
+ " <td>...</td>\n",
209
+ " <td>...</td>\n",
210
+ " <td>...</td>\n",
211
+ " <td>...</td>\n",
212
+ " <td>...</td>\n",
213
+ " <td>...</td>\n",
214
+ " <td>...</td>\n",
215
+ " <td>...</td>\n",
216
+ " <td>...</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>102594</th>\n",
220
+ " <td>6092437</td>\n",
221
+ " <td>Spare room in Williamsburg</td>\n",
222
+ " <td>12312296767</td>\n",
223
+ " <td>verified</td>\n",
224
+ " <td>Krik</td>\n",
225
+ " <td>Brooklyn</td>\n",
226
+ " <td>Williamsburg</td>\n",
227
+ " <td>40.70862</td>\n",
228
+ " <td>-73.94651</td>\n",
229
+ " <td>United States</td>\n",
230
+ " <td>...</td>\n",
231
+ " <td>$169</td>\n",
232
+ " <td>1.0</td>\n",
233
+ " <td>0.0</td>\n",
234
+ " <td>NaN</td>\n",
235
+ " <td>NaN</td>\n",
236
+ " <td>3.0</td>\n",
237
+ " <td>1.0</td>\n",
238
+ " <td>227.0</td>\n",
239
+ " <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
240
+ " <td>NaN</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>102595</th>\n",
244
+ " <td>6092990</td>\n",
245
+ " <td>Best Location near Columbia U</td>\n",
246
+ " <td>77864383453</td>\n",
247
+ " <td>unconfirmed</td>\n",
248
+ " <td>Mifan</td>\n",
249
+ " <td>Manhattan</td>\n",
250
+ " <td>Morningside Heights</td>\n",
251
+ " <td>40.80460</td>\n",
252
+ " <td>-73.96545</td>\n",
253
+ " <td>United States</td>\n",
254
+ " <td>...</td>\n",
255
+ " <td>$167</td>\n",
256
+ " <td>1.0</td>\n",
257
+ " <td>1.0</td>\n",
258
+ " <td>7/6/2015</td>\n",
259
+ " <td>0.02</td>\n",
260
+ " <td>2.0</td>\n",
261
+ " <td>2.0</td>\n",
262
+ " <td>395.0</td>\n",
263
+ " <td>House rules: Guests agree to the following ter...</td>\n",
264
+ " <td>NaN</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>102596</th>\n",
268
+ " <td>6093542</td>\n",
269
+ " <td>Comfy, bright room in Brooklyn</td>\n",
270
+ " <td>69050334417</td>\n",
271
+ " <td>unconfirmed</td>\n",
272
+ " <td>Megan</td>\n",
273
+ " <td>Brooklyn</td>\n",
274
+ " <td>Park Slope</td>\n",
275
+ " <td>40.67505</td>\n",
276
+ " <td>-73.98045</td>\n",
277
+ " <td>United States</td>\n",
278
+ " <td>...</td>\n",
279
+ " <td>$198</td>\n",
280
+ " <td>3.0</td>\n",
281
+ " <td>0.0</td>\n",
282
+ " <td>NaN</td>\n",
283
+ " <td>NaN</td>\n",
284
+ " <td>5.0</td>\n",
285
+ " <td>1.0</td>\n",
286
+ " <td>342.0</td>\n",
287
+ " <td>NaN</td>\n",
288
+ " <td>NaN</td>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>102597</th>\n",
292
+ " <td>6094094</td>\n",
293
+ " <td>Big Studio-One Stop from Midtown</td>\n",
294
+ " <td>11160591270</td>\n",
295
+ " <td>unconfirmed</td>\n",
296
+ " <td>Christopher</td>\n",
297
+ " <td>Queens</td>\n",
298
+ " <td>Long Island City</td>\n",
299
+ " <td>40.74989</td>\n",
300
+ " <td>-73.93777</td>\n",
301
+ " <td>United States</td>\n",
302
+ " <td>...</td>\n",
303
+ " <td>$109</td>\n",
304
+ " <td>2.0</td>\n",
305
+ " <td>5.0</td>\n",
306
+ " <td>10/11/2015</td>\n",
307
+ " <td>0.10</td>\n",
308
+ " <td>3.0</td>\n",
309
+ " <td>1.0</td>\n",
310
+ " <td>386.0</td>\n",
311
+ " <td>NaN</td>\n",
312
+ " <td>NaN</td>\n",
313
+ " </tr>\n",
314
+ " <tr>\n",
315
+ " <th>102598</th>\n",
316
+ " <td>6094647</td>\n",
317
+ " <td>585 sf Luxury Studio</td>\n",
318
+ " <td>68170633372</td>\n",
319
+ " <td>unconfirmed</td>\n",
320
+ " <td>Rebecca</td>\n",
321
+ " <td>Manhattan</td>\n",
322
+ " <td>Upper West Side</td>\n",
323
+ " <td>40.76807</td>\n",
324
+ " <td>-73.98342</td>\n",
325
+ " <td>United States</td>\n",
326
+ " <td>...</td>\n",
327
+ " <td>$206</td>\n",
328
+ " <td>1.0</td>\n",
329
+ " <td>0.0</td>\n",
330
+ " <td>NaN</td>\n",
331
+ " <td>NaN</td>\n",
332
+ " <td>3.0</td>\n",
333
+ " <td>1.0</td>\n",
334
+ " <td>69.0</td>\n",
335
+ " <td>NaN</td>\n",
336
+ " <td>NaN</td>\n",
337
+ " </tr>\n",
338
+ " </tbody>\n",
339
+ "</table>\n",
340
+ "<p>102599 rows × 26 columns</p>\n",
341
+ "</div>"
342
+ ],
343
+ "text/plain": [
344
+ " id NAME \n",
345
+ "0 1001254 Clean & quiet apt home by the park \\\n",
346
+ "1 1002102 Skylit Midtown Castle \n",
347
+ "2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! \n",
348
+ "3 1002755 NaN \n",
349
+ "4 1003689 Entire Apt: Spacious Studio/Loft by central park \n",
350
+ "... ... ... \n",
351
+ "102594 6092437 Spare room in Williamsburg \n",
352
+ "102595 6092990 Best Location near Columbia U \n",
353
+ "102596 6093542 Comfy, bright room in Brooklyn \n",
354
+ "102597 6094094 Big Studio-One Stop from Midtown \n",
355
+ "102598 6094647 585 sf Luxury Studio \n",
356
+ "\n",
357
+ " host id host_identity_verified host name neighbourhood group \n",
358
+ "0 80014485718 unconfirmed Madaline Brooklyn \\\n",
359
+ "1 52335172823 verified Jenna Manhattan \n",
360
+ "2 78829239556 NaN Elise Manhattan \n",
361
+ "3 85098326012 unconfirmed Garry Brooklyn \n",
362
+ "4 92037596077 verified Lyndon Manhattan \n",
363
+ "... ... ... ... ... \n",
364
+ "102594 12312296767 verified Krik Brooklyn \n",
365
+ "102595 77864383453 unconfirmed Mifan Manhattan \n",
366
+ "102596 69050334417 unconfirmed Megan Brooklyn \n",
367
+ "102597 11160591270 unconfirmed Christopher Queens \n",
368
+ "102598 68170633372 unconfirmed Rebecca Manhattan \n",
369
+ "\n",
370
+ " neighbourhood lat long country ... \n",
371
+ "0 Kensington 40.64749 -73.97237 United States ... \\\n",
372
+ "1 Midtown 40.75362 -73.98377 United States ... \n",
373
+ "2 Harlem 40.80902 -73.94190 United States ... \n",
374
+ "3 Clinton Hill 40.68514 -73.95976 United States ... \n",
375
+ "4 East Harlem 40.79851 -73.94399 United States ... \n",
376
+ "... ... ... ... ... ... \n",
377
+ "102594 Williamsburg 40.70862 -73.94651 United States ... \n",
378
+ "102595 Morningside Heights 40.80460 -73.96545 United States ... \n",
379
+ "102596 Park Slope 40.67505 -73.98045 United States ... \n",
380
+ "102597 Long Island City 40.74989 -73.93777 United States ... \n",
381
+ "102598 Upper West Side 40.76807 -73.98342 United States ... \n",
382
+ "\n",
383
+ " service fee minimum nights number of reviews last review \n",
384
+ "0 $193 10.0 9.0 10/19/2021 \\\n",
385
+ "1 $28 30.0 45.0 5/21/2022 \n",
386
+ "2 $124 3.0 0.0 NaN \n",
387
+ "3 $74 30.0 270.0 7/5/2019 \n",
388
+ "4 $41 10.0 9.0 11/19/2018 \n",
389
+ "... ... ... ... ... \n",
390
+ "102594 $169 1.0 0.0 NaN \n",
391
+ "102595 $167 1.0 1.0 7/6/2015 \n",
392
+ "102596 $198 3.0 0.0 NaN \n",
393
+ "102597 $109 2.0 5.0 10/11/2015 \n",
394
+ "102598 $206 1.0 0.0 NaN \n",
395
+ "\n",
396
+ " reviews per month review rate number calculated host listings count \n",
397
+ "0 0.21 4.0 6.0 \\\n",
398
+ "1 0.38 4.0 2.0 \n",
399
+ "2 NaN 5.0 1.0 \n",
400
+ "3 4.64 4.0 1.0 \n",
401
+ "4 0.10 3.0 1.0 \n",
402
+ "... ... ... ... \n",
403
+ "102594 NaN 3.0 1.0 \n",
404
+ "102595 0.02 2.0 2.0 \n",
405
+ "102596 NaN 5.0 1.0 \n",
406
+ "102597 0.10 3.0 1.0 \n",
407
+ "102598 NaN 3.0 1.0 \n",
408
+ "\n",
409
+ " availability 365 house_rules \n",
410
+ "0 286.0 Clean up and treat the home the way you'd like... \\\n",
411
+ "1 228.0 Pet friendly but please confirm with me if the... \n",
412
+ "2 352.0 I encourage you to use my kitchen, cooking and... \n",
413
+ "3 322.0 NaN \n",
414
+ "4 289.0 Please no smoking in the house, porch or on th... \n",
415
+ "... ... ... \n",
416
+ "102594 227.0 No Smoking No Parties or Events of any kind Pl... \n",
417
+ "102595 395.0 House rules: Guests agree to the following ter... \n",
418
+ "102596 342.0 NaN \n",
419
+ "102597 386.0 NaN \n",
420
+ "102598 69.0 NaN \n",
421
+ "\n",
422
+ " license \n",
423
+ "0 NaN \n",
424
+ "1 NaN \n",
425
+ "2 NaN \n",
426
+ "3 NaN \n",
427
+ "4 NaN \n",
428
+ "... ... \n",
429
+ "102594 NaN \n",
430
+ "102595 NaN \n",
431
+ "102596 NaN \n",
432
+ "102597 NaN \n",
433
+ "102598 NaN \n",
434
+ "\n",
435
+ "[102599 rows x 26 columns]"
436
+ ]
437
+ },
438
+ "execution_count": 2,
439
+ "metadata": {},
440
+ "output_type": "execute_result"
441
+ }
442
+ ],
443
+ "source": [
444
+ "data"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 3,
450
+ "id": "e21af5d1",
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "flight = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/flights/clean_Flights_2022.csv')"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": 4,
460
+ "id": "966feef9",
461
+ "metadata": {},
462
+ "outputs": [],
463
+ "source": [
464
+ "flight = flight.to_dict(orient = 'split')"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 5,
470
+ "id": "3f4fe062",
471
+ "metadata": {},
472
+ "outputs": [],
473
+ "source": [
474
+ "data_dict = data.to_dict(orient = 'split')"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": 6,
480
+ "id": "33213ac0",
481
+ "metadata": {},
482
+ "outputs": [
483
+ {
484
+ "data": {
485
+ "text/plain": [
486
+ "[2, '2022-04-04', '15:14', '16:36', 251.0, 'Durango', 'Denver', 100]"
487
+ ]
488
+ },
489
+ "execution_count": 6,
490
+ "metadata": {},
491
+ "output_type": "execute_result"
492
+ }
493
+ ],
494
+ "source": [
495
+ "flight['data'][2]"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 8,
501
+ "id": "9cef6161",
502
+ "metadata": {},
503
+ "outputs": [
504
+ {
505
+ "name": "stdout",
506
+ "output_type": "stream",
507
+ "text": [
508
+ "nan\n"
509
+ ]
510
+ }
511
+ ],
512
+ "source": [
513
+ "print(str(data_dict['data'][3][24]))"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": 9,
519
+ "id": "c5f81f43",
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "city_set = set()\n",
524
+ "cnt = 0\n",
525
+ "for unit in data_dict['data']:\n",
526
+ " if str(unit[24]) != 'nan':\n",
527
+ " cnt += 1"
528
+ ]
529
+ },
530
+ {
531
+ "cell_type": "code",
532
+ "execution_count": 10,
533
+ "id": "533a5aa6",
534
+ "metadata": {},
535
+ "outputs": [
536
+ {
537
+ "data": {
538
+ "text/plain": [
539
+ "50468"
540
+ ]
541
+ },
542
+ "execution_count": 10,
543
+ "metadata": {},
544
+ "output_type": "execute_result"
545
+ }
546
+ ],
547
+ "source": [
548
+ "cnt"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": 11,
554
+ "id": "bfce5f56",
555
+ "metadata": {},
556
+ "outputs": [
557
+ {
558
+ "data": {
559
+ "text/plain": [
560
+ "set()"
561
+ ]
562
+ },
563
+ "execution_count": 11,
564
+ "metadata": {},
565
+ "output_type": "execute_result"
566
+ }
567
+ ],
568
+ "source": [
569
+ "city_set"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 12,
575
+ "id": "230b760c",
576
+ "metadata": {},
577
+ "outputs": [
578
+ {
579
+ "ename": "ValueError",
580
+ "evalue": "Sample larger than population or is negative",
581
+ "output_type": "error",
582
+ "traceback": [
583
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
584
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
585
+ "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 2\u001b[0m city_set \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(city_set)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcity_set\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m)\n",
586
+ "File \u001b[0;32m~/miniconda3/envs/py39/lib/python3.9/random.py:449\u001b[0m, in \u001b[0;36mRandom.sample\u001b[0;34m(self, population, k, counts)\u001b[0m\n\u001b[1;32m 447\u001b[0m randbelow \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_randbelow\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m k \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m n:\n\u001b[0;32m--> 449\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSample larger than population or is negative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 450\u001b[0m result \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m k\n\u001b[1;32m 451\u001b[0m setsize \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m21\u001b[39m \u001b[38;5;66;03m# size of a small set minus size of an empty list\u001b[39;00m\n",
587
+ "\u001b[0;31mValueError\u001b[0m: Sample larger than population or is negative"
588
+ ]
589
+ }
590
+ ],
591
+ "source": [
592
+ "import random\n",
593
+ "city_set = list(city_set)\n",
594
+ "print(random.sample(city_set,1))"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 12,
600
+ "id": "61eddd5f",
601
+ "metadata": {},
602
+ "outputs": [
603
+ {
604
+ "ename": "AttributeError",
605
+ "evalue": "'dict' object has no attribute 'to_dict'",
606
+ "output_type": "error",
607
+ "traceback": [
608
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
609
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
610
+ "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data_dict \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_dict\u001b[49m(orient \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msplit\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
611
+ "\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'to_dict'"
612
+ ]
613
+ }
614
+ ],
615
+ "source": [
616
+ "data_dict = data.to_dict(orient = 'split')"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 35,
622
+ "id": "3292c450",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "data": {
627
+ "text/plain": [
628
+ "['Unnamed: 0',\n",
629
+ " 'NAME',\n",
630
+ " 'room type',\n",
631
+ " 'price',\n",
632
+ " 'minimum nights',\n",
633
+ " 'review rate number',\n",
634
+ " 'house_rules',\n",
635
+ " 'maximum occupancy',\n",
636
+ " 'city']"
637
+ ]
638
+ },
639
+ "execution_count": 35,
640
+ "metadata": {},
641
+ "output_type": "execute_result"
642
+ }
643
+ ],
644
+ "source": [
645
+ "data_dict['columns']"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": 38,
651
+ "id": "cfaa21d9",
652
+ "metadata": {},
653
+ "outputs": [
654
+ {
655
+ "data": {
656
+ "text/plain": [
657
+ "5047"
658
+ ]
659
+ },
660
+ "execution_count": 38,
661
+ "metadata": {},
662
+ "output_type": "execute_result"
663
+ }
664
+ ],
665
+ "source": [
666
+ "len(data_dict['data'])"
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": 36,
672
+ "id": "2980362d",
673
+ "metadata": {},
674
+ "outputs": [],
675
+ "source": [
676
+ "type_set = set()\n",
677
+ "for unit in data_dict['data']:\n",
678
+ " type_set.add(unit[2])"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "code",
683
+ "execution_count": 37,
684
+ "id": "f5e36fbb",
685
+ "metadata": {},
686
+ "outputs": [
687
+ {
688
+ "data": {
689
+ "text/plain": [
690
+ "{'Entire home/apt', 'Private room', 'Shared room'}"
691
+ ]
692
+ },
693
+ "execution_count": 37,
694
+ "metadata": {},
695
+ "output_type": "execute_result"
696
+ }
697
+ ],
698
+ "source": [
699
+ "type_set"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": 15,
705
+ "id": "bf1231c4",
706
+ "metadata": {},
707
+ "outputs": [
708
+ {
709
+ "ename": "NameError",
710
+ "evalue": "name 'data_dict' is not defined",
711
+ "output_type": "error",
712
+ "traceback": [
713
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
714
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
715
+ "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdata_dict\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m'\u001b[39m][\u001b[38;5;241m147\u001b[39m]\n",
716
+ "\u001b[0;31mNameError\u001b[0m: name 'data_dict' is not defined"
717
+ ]
718
+ }
719
+ ],
720
+ "source": [
721
+ "data_dict['data'][147]"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 14,
727
+ "id": "f993b894",
728
+ "metadata": {},
729
+ "outputs": [
730
+ {
731
+ "data": {
732
+ "text/plain": [
733
+ "set()"
734
+ ]
735
+ },
736
+ "execution_count": 14,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "type_set"
743
+ ]
744
+ },
745
+ {
746
+ "cell_type": "code",
747
+ "execution_count": 10,
748
+ "id": "916e9470",
749
+ "metadata": {},
750
+ "outputs": [
751
+ {
752
+ "name": "stdout",
753
+ "output_type": "stream",
754
+ "text": [
755
+ "1 NAME\n",
756
+ "7 lat\n",
757
+ "8 long\n",
758
+ "13 room type\n",
759
+ "15 price\n",
760
+ "17 minimum nights\n",
761
+ "21 review rate number\n",
762
+ "24 house_rules\n"
763
+ ]
764
+ }
765
+ ],
766
+ "source": [
767
+ "for idx, unit in enumerate(data_dict['columns']):\n",
768
+ " if unit in ['NAME','lat', 'long', 'room type', 'price','minimum nights','review rate number','house_rules']:\n",
769
+ " print(idx,unit)"
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "code",
774
+ "execution_count": 73,
775
+ "id": "1213484d",
776
+ "metadata": {},
777
+ "outputs": [
778
+ {
779
+ "data": {
780
+ "application/vnd.jupyter.widget-view+json": {
781
+ "model_id": "51764c1a3739416289913ec613816cc7",
782
+ "version_major": 2,
783
+ "version_minor": 0
784
+ },
785
+ "text/plain": [
786
+ "0it [00:00, ?it/s]"
787
+ ]
788
+ },
789
+ "metadata": {},
790
+ "output_type": "display_data"
791
+ },
792
+ {
793
+ "name": "stderr",
794
+ "output_type": "stream",
795
+ "text": [
796
+ "/tmp/ipykernel_3241846/557604333.py:23: DeprecationWarning: Sampling from a set deprecated\n",
797
+ "since Python 3.9 and will be removed in a subsequent version.\n",
798
+ " tmp_dict[\"city\"] = random.sample(city_set,1)[0]\n"
799
+ ]
800
+ },
801
+ {
802
+ "ename": "ValueError",
803
+ "evalue": "Sample larger than population or is negative",
804
+ "output_type": "error",
805
+ "traceback": [
806
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
807
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
808
+ "Cell \u001b[0;32mIn[73], line 23\u001b[0m\n\u001b[1;32m 21\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreview rate number\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m unit[\u001b[38;5;241m21\u001b[39m]\n\u001b[1;32m 22\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhouse_rules\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m unit[\u001b[38;5;241m24\u001b[39m]\n\u001b[0;32m---> 23\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcity\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcity_set\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 24\u001b[0m new_data\u001b[38;5;241m.\u001b[39mappend(tmp_dict)\n",
809
+ "File \u001b[0;32m~/miniconda3/envs/py39/lib/python3.9/random.py:449\u001b[0m, in \u001b[0;36mRandom.sample\u001b[0;34m(self, population, k, counts)\u001b[0m\n\u001b[1;32m 447\u001b[0m randbelow \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_randbelow\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m k \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m n:\n\u001b[0;32m--> 449\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSample larger than population or is negative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 450\u001b[0m result \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m k\n\u001b[1;32m 451\u001b[0m setsize \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m21\u001b[39m \u001b[38;5;66;03m# size of a small set minus size of an empty list\u001b[39;00m\n",
810
+ "\u001b[0;31mValueError\u001b[0m: Sample larger than population or is negative"
811
+ ]
812
+ }
813
+ ],
814
+ "source": [
815
+ "from tqdm.autonotebook import tqdm\n",
816
+ "import random\n",
817
+ "new_data = []\n",
818
+ "for idx, unit in tqdm(enumerate(data_dict['data'])):\n",
819
+ " tmp_dict = {k:\"\" for k in ['NAME','room type', 'price','minimum nights','review rate number','house_rules']}\n",
820
+ " tmp_dict[\"NAME\"] = unit[1]\n",
821
+ " tmp_dict[\"room type\"] = unit[13]\n",
822
+ " if unit[13] == \"Shared room\":\n",
823
+ " tmp_dict[\"maximum occupancy\"] = 1\n",
824
+ " elif unit[13] == \"Hotel room\":\n",
825
+ " tmp_dict[\"maximum occupancy\"] = random.randint(1, 2)\n",
826
+ " elif unit[13] == \"Private room\":\n",
827
+ " tmp_dict[\"maximum occupancy\"] = random.randint(1, 2)\n",
828
+ " elif unit[13] == \"Entire home/apt\":\n",
829
+ " try:\n",
830
+ " tmp_dict[\"maximum occupancy\"] = random.randint(2, max(3,eval(unit[15].replace(\"$\",\"\").replace(\",\",\"\"))//100))\n",
831
+ " except:\n",
832
+ " tmp_dict[\"maximum occupancy\"] = random.randint(2, max(3,unit[15]//100))\n",
833
+ " tmp_dict[\"price\"] = unit[15].replace(\"$\",\"\").replace(\",\",\"\")\n",
834
+ " tmp_dict[\"minimum nights\"] = unit[17]\n",
835
+ " tmp_dict[\"review rate number\"] = unit[21]\n",
836
+ " tmp_dict[\"house_rules\"] = unit[24]\n",
837
+ " tmp_dict[\"city\"] = random.sample(city_set,1)[0]\n",
838
+ " new_data.append(tmp_dict)"
839
+ ]
840
+ },
841
+ {
842
+ "cell_type": "code",
843
+ "execution_count": 20,
844
+ "id": "fd3e8257",
845
+ "metadata": {},
846
+ "outputs": [
847
+ {
848
+ "data": {
849
+ "text/plain": [
850
+ "102599"
851
+ ]
852
+ },
853
+ "execution_count": 20,
854
+ "metadata": {},
855
+ "output_type": "execute_result"
856
+ }
857
+ ],
858
+ "source": [
859
+ "len(new_data)"
860
+ ]
861
+ },
862
+ {
863
+ "cell_type": "code",
864
+ "execution_count": 21,
865
+ "id": "bfb243c0",
866
+ "metadata": {},
867
+ "outputs": [],
868
+ "source": [
869
+ "df = pd.DataFrame(new_data)"
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "execution_count": 23,
875
+ "id": "af7e3411",
876
+ "metadata": {},
877
+ "outputs": [],
878
+ "source": [
879
+ "df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
880
+ ]
881
+ },
882
+ {
883
+ "cell_type": "code",
884
+ "execution_count": 22,
885
+ "id": "71d21fea",
886
+ "metadata": {},
887
+ "outputs": [
888
+ {
889
+ "data": {
890
+ "text/html": [
891
+ "<div>\n",
892
+ "<style scoped>\n",
893
+ " .dataframe tbody tr th:only-of-type {\n",
894
+ " vertical-align: middle;\n",
895
+ " }\n",
896
+ "\n",
897
+ " .dataframe tbody tr th {\n",
898
+ " vertical-align: top;\n",
899
+ " }\n",
900
+ "\n",
901
+ " .dataframe thead th {\n",
902
+ " text-align: right;\n",
903
+ " }\n",
904
+ "</style>\n",
905
+ "<table border=\"1\" class=\"dataframe\">\n",
906
+ " <thead>\n",
907
+ " <tr style=\"text-align: right;\">\n",
908
+ " <th></th>\n",
909
+ " <th>NAME</th>\n",
910
+ " <th>room type</th>\n",
911
+ " <th>price</th>\n",
912
+ " <th>minimum nights</th>\n",
913
+ " <th>review rate number</th>\n",
914
+ " <th>house_rules</th>\n",
915
+ " <th>maximum occupancy</th>\n",
916
+ " <th>city</th>\n",
917
+ " </tr>\n",
918
+ " </thead>\n",
919
+ " <tbody>\n",
920
+ " <tr>\n",
921
+ " <th>0</th>\n",
922
+ " <td>Clean &amp; quiet apt home by the park</td>\n",
923
+ " <td>Private room</td>\n",
924
+ " <td>$966</td>\n",
925
+ " <td>10.0</td>\n",
926
+ " <td>4.0</td>\n",
927
+ " <td>Clean up and treat the home the way you'd like...</td>\n",
928
+ " <td>1</td>\n",
929
+ " <td>Des Moines</td>\n",
930
+ " </tr>\n",
931
+ " <tr>\n",
932
+ " <th>1</th>\n",
933
+ " <td>Skylit Midtown Castle</td>\n",
934
+ " <td>Entire home/apt</td>\n",
935
+ " <td>$142</td>\n",
936
+ " <td>30.0</td>\n",
937
+ " <td>4.0</td>\n",
938
+ " <td>Pet friendly but please confirm with me if the...</td>\n",
939
+ " <td>2</td>\n",
940
+ " <td>Wilmington</td>\n",
941
+ " </tr>\n",
942
+ " <tr>\n",
943
+ " <th>2</th>\n",
944
+ " <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
945
+ " <td>Private room</td>\n",
946
+ " <td>$620</td>\n",
947
+ " <td>3.0</td>\n",
948
+ " <td>5.0</td>\n",
949
+ " <td>I encourage you to use my kitchen, cooking and...</td>\n",
950
+ " <td>2</td>\n",
951
+ " <td>St. George</td>\n",
952
+ " </tr>\n",
953
+ " <tr>\n",
954
+ " <th>3</th>\n",
955
+ " <td>NaN</td>\n",
956
+ " <td>Entire home/apt</td>\n",
957
+ " <td>$368</td>\n",
958
+ " <td>30.0</td>\n",
959
+ " <td>4.0</td>\n",
960
+ " <td>NaN</td>\n",
961
+ " <td>2</td>\n",
962
+ " <td>Kalamazoo</td>\n",
963
+ " </tr>\n",
964
+ " <tr>\n",
965
+ " <th>4</th>\n",
966
+ " <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
967
+ " <td>Entire home/apt</td>\n",
968
+ " <td>$204</td>\n",
969
+ " <td>10.0</td>\n",
970
+ " <td>3.0</td>\n",
971
+ " <td>Please no smoking in the house, porch or on th...</td>\n",
972
+ " <td>3</td>\n",
973
+ " <td>Cheyenne</td>\n",
974
+ " </tr>\n",
975
+ " <tr>\n",
976
+ " <th>...</th>\n",
977
+ " <td>...</td>\n",
978
+ " <td>...</td>\n",
979
+ " <td>...</td>\n",
980
+ " <td>...</td>\n",
981
+ " <td>...</td>\n",
982
+ " <td>...</td>\n",
983
+ " <td>...</td>\n",
984
+ " <td>...</td>\n",
985
+ " </tr>\n",
986
+ " <tr>\n",
987
+ " <th>102594</th>\n",
988
+ " <td>Spare room in Williamsburg</td>\n",
989
+ " <td>Private room</td>\n",
990
+ " <td>$844</td>\n",
991
+ " <td>1.0</td>\n",
992
+ " <td>3.0</td>\n",
993
+ " <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
994
+ " <td>1</td>\n",
995
+ " <td>White Plains</td>\n",
996
+ " </tr>\n",
997
+ " <tr>\n",
998
+ " <th>102595</th>\n",
999
+ " <td>Best Location near Columbia U</td>\n",
1000
+ " <td>Private room</td>\n",
1001
+ " <td>$837</td>\n",
1002
+ " <td>1.0</td>\n",
1003
+ " <td>2.0</td>\n",
1004
+ " <td>House rules: Guests agree to the following ter...</td>\n",
1005
+ " <td>2</td>\n",
1006
+ " <td>Mosinee</td>\n",
1007
+ " </tr>\n",
1008
+ " <tr>\n",
1009
+ " <th>102596</th>\n",
1010
+ " <td>Comfy, bright room in Brooklyn</td>\n",
1011
+ " <td>Private room</td>\n",
1012
+ " <td>$988</td>\n",
1013
+ " <td>3.0</td>\n",
1014
+ " <td>5.0</td>\n",
1015
+ " <td>NaN</td>\n",
1016
+ " <td>2</td>\n",
1017
+ " <td>Amarillo</td>\n",
1018
+ " </tr>\n",
1019
+ " <tr>\n",
1020
+ " <th>102597</th>\n",
1021
+ " <td>Big Studio-One Stop from Midtown</td>\n",
1022
+ " <td>Entire home/apt</td>\n",
1023
+ " <td>$546</td>\n",
1024
+ " <td>2.0</td>\n",
1025
+ " <td>3.0</td>\n",
1026
+ " <td>NaN</td>\n",
1027
+ " <td>4</td>\n",
1028
+ " <td>Binghamton</td>\n",
1029
+ " </tr>\n",
1030
+ " <tr>\n",
1031
+ " <th>102598</th>\n",
1032
+ " <td>585 sf Luxury Studio</td>\n",
1033
+ " <td>Entire home/apt</td>\n",
1034
+ " <td>$1,032</td>\n",
1035
+ " <td>1.0</td>\n",
1036
+ " <td>3.0</td>\n",
1037
+ " <td>NaN</td>\n",
1038
+ " <td>7</td>\n",
1039
+ " <td>Flint</td>\n",
1040
+ " </tr>\n",
1041
+ " </tbody>\n",
1042
+ "</table>\n",
1043
+ "<p>102599 rows × 8 columns</p>\n",
1044
+ "</div>"
1045
+ ],
1046
+ "text/plain": [
1047
+ " NAME room type \n",
1048
+ "0 Clean & quiet apt home by the park Private room \\\n",
1049
+ "1 Skylit Midtown Castle Entire home/apt \n",
1050
+ "2 THE VILLAGE OF HARLEM....NEW YORK ! Private room \n",
1051
+ "3 NaN Entire home/apt \n",
1052
+ "4 Entire Apt: Spacious Studio/Loft by central park Entire home/apt \n",
1053
+ "... ... ... \n",
1054
+ "102594 Spare room in Williamsburg Private room \n",
1055
+ "102595 Best Location near Columbia U Private room \n",
1056
+ "102596 Comfy, bright room in Brooklyn Private room \n",
1057
+ "102597 Big Studio-One Stop from Midtown Entire home/apt \n",
1058
+ "102598 585 sf Luxury Studio Entire home/apt \n",
1059
+ "\n",
1060
+ " price minimum nights review rate number \n",
1061
+ "0 $966 10.0 4.0 \\\n",
1062
+ "1 $142 30.0 4.0 \n",
1063
+ "2 $620 3.0 5.0 \n",
1064
+ "3 $368 30.0 4.0 \n",
1065
+ "4 $204 10.0 3.0 \n",
1066
+ "... ... ... ... \n",
1067
+ "102594 $844 1.0 3.0 \n",
1068
+ "102595 $837 1.0 2.0 \n",
1069
+ "102596 $988 3.0 5.0 \n",
1070
+ "102597 $546 2.0 3.0 \n",
1071
+ "102598 $1,032 1.0 3.0 \n",
1072
+ "\n",
1073
+ " house_rules maximum occupancy \n",
1074
+ "0 Clean up and treat the home the way you'd like... 1 \\\n",
1075
+ "1 Pet friendly but please confirm with me if the... 2 \n",
1076
+ "2 I encourage you to use my kitchen, cooking and... 2 \n",
1077
+ "3 NaN 2 \n",
1078
+ "4 Please no smoking in the house, porch or on th... 3 \n",
1079
+ "... ... ... \n",
1080
+ "102594 No Smoking No Parties or Events of any kind Pl... 1 \n",
1081
+ "102595 House rules: Guests agree to the following ter... 2 \n",
1082
+ "102596 NaN 2 \n",
1083
+ "102597 NaN 4 \n",
1084
+ "102598 NaN 7 \n",
1085
+ "\n",
1086
+ " city \n",
1087
+ "0 Des Moines \n",
1088
+ "1 Wilmington \n",
1089
+ "2 St. George \n",
1090
+ "3 Kalamazoo \n",
1091
+ "4 Cheyenne \n",
1092
+ "... ... \n",
1093
+ "102594 White Plains \n",
1094
+ "102595 Mosinee \n",
1095
+ "102596 Amarillo \n",
1096
+ "102597 Binghamton \n",
1097
+ "102598 Flint \n",
1098
+ "\n",
1099
+ "[102599 rows x 8 columns]"
1100
+ ]
1101
+ },
1102
+ "execution_count": 22,
1103
+ "metadata": {},
1104
+ "output_type": "execute_result"
1105
+ }
1106
+ ],
1107
+ "source": [
1108
+ "df"
1109
+ ]
1110
+ },
1111
+ {
1112
+ "cell_type": "code",
1113
+ "execution_count": 50,
1114
+ "id": "0ec56283",
1115
+ "metadata": {},
1116
+ "outputs": [],
1117
+ "source": [
1118
+ "import pandas as pd\n",
1119
+ "data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 52,
1125
+ "id": "5dc27048",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "text/html": [
1131
+ "<div>\n",
1132
+ "<style scoped>\n",
1133
+ " .dataframe tbody tr th:only-of-type {\n",
1134
+ " vertical-align: middle;\n",
1135
+ " }\n",
1136
+ "\n",
1137
+ " .dataframe tbody tr th {\n",
1138
+ " vertical-align: top;\n",
1139
+ " }\n",
1140
+ "\n",
1141
+ " .dataframe thead th {\n",
1142
+ " text-align: right;\n",
1143
+ " }\n",
1144
+ "</style>\n",
1145
+ "<table border=\"1\" class=\"dataframe\">\n",
1146
+ " <thead>\n",
1147
+ " <tr style=\"text-align: right;\">\n",
1148
+ " <th></th>\n",
1149
+ " <th>Unnamed: 0</th>\n",
1150
+ " <th>NAME</th>\n",
1151
+ " <th>room type</th>\n",
1152
+ " <th>price</th>\n",
1153
+ " <th>minimum nights</th>\n",
1154
+ " <th>review rate number</th>\n",
1155
+ " <th>house_rules</th>\n",
1156
+ " <th>maximum occupancy</th>\n",
1157
+ " <th>city</th>\n",
1158
+ " </tr>\n",
1159
+ " </thead>\n",
1160
+ " <tbody>\n",
1161
+ " <tr>\n",
1162
+ " <th>0</th>\n",
1163
+ " <td>0</td>\n",
1164
+ " <td>Clean &amp; quiet apt home by the park</td>\n",
1165
+ " <td>Private room</td>\n",
1166
+ " <td>$966</td>\n",
1167
+ " <td>10.0</td>\n",
1168
+ " <td>4.0</td>\n",
1169
+ " <td>Clean up and treat the home the way you'd like...</td>\n",
1170
+ " <td>1</td>\n",
1171
+ " <td>Des Moines</td>\n",
1172
+ " </tr>\n",
1173
+ " <tr>\n",
1174
+ " <th>1</th>\n",
1175
+ " <td>1</td>\n",
1176
+ " <td>Skylit Midtown Castle</td>\n",
1177
+ " <td>Entire home/apt</td>\n",
1178
+ " <td>$142</td>\n",
1179
+ " <td>30.0</td>\n",
1180
+ " <td>4.0</td>\n",
1181
+ " <td>Pet friendly but please confirm with me if the...</td>\n",
1182
+ " <td>2</td>\n",
1183
+ " <td>Wilmington</td>\n",
1184
+ " </tr>\n",
1185
+ " <tr>\n",
1186
+ " <th>2</th>\n",
1187
+ " <td>2</td>\n",
1188
+ " <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
1189
+ " <td>Private room</td>\n",
1190
+ " <td>$620</td>\n",
1191
+ " <td>3.0</td>\n",
1192
+ " <td>5.0</td>\n",
1193
+ " <td>I encourage you to use my kitchen, cooking and...</td>\n",
1194
+ " <td>2</td>\n",
1195
+ " <td>St. George</td>\n",
1196
+ " </tr>\n",
1197
+ " <tr>\n",
1198
+ " <th>3</th>\n",
1199
+ " <td>3</td>\n",
1200
+ " <td>NaN</td>\n",
1201
+ " <td>Entire home/apt</td>\n",
1202
+ " <td>$368</td>\n",
1203
+ " <td>30.0</td>\n",
1204
+ " <td>4.0</td>\n",
1205
+ " <td>NaN</td>\n",
1206
+ " <td>2</td>\n",
1207
+ " <td>Kalamazoo</td>\n",
1208
+ " </tr>\n",
1209
+ " <tr>\n",
1210
+ " <th>4</th>\n",
1211
+ " <td>4</td>\n",
1212
+ " <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
1213
+ " <td>Entire home/apt</td>\n",
1214
+ " <td>$204</td>\n",
1215
+ " <td>10.0</td>\n",
1216
+ " <td>3.0</td>\n",
1217
+ " <td>Please no smoking in the house, porch or on th...</td>\n",
1218
+ " <td>3</td>\n",
1219
+ " <td>Cheyenne</td>\n",
1220
+ " </tr>\n",
1221
+ " <tr>\n",
1222
+ " <th>...</th>\n",
1223
+ " <td>...</td>\n",
1224
+ " <td>...</td>\n",
1225
+ " <td>...</td>\n",
1226
+ " <td>...</td>\n",
1227
+ " <td>...</td>\n",
1228
+ " <td>...</td>\n",
1229
+ " <td>...</td>\n",
1230
+ " <td>...</td>\n",
1231
+ " <td>...</td>\n",
1232
+ " </tr>\n",
1233
+ " <tr>\n",
1234
+ " <th>102594</th>\n",
1235
+ " <td>102594</td>\n",
1236
+ " <td>Spare room in Williamsburg</td>\n",
1237
+ " <td>Private room</td>\n",
1238
+ " <td>$844</td>\n",
1239
+ " <td>1.0</td>\n",
1240
+ " <td>3.0</td>\n",
1241
+ " <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
1242
+ " <td>1</td>\n",
1243
+ " <td>White Plains</td>\n",
1244
+ " </tr>\n",
1245
+ " <tr>\n",
1246
+ " <th>102595</th>\n",
1247
+ " <td>102595</td>\n",
1248
+ " <td>Best Location near Columbia U</td>\n",
1249
+ " <td>Private room</td>\n",
1250
+ " <td>$837</td>\n",
1251
+ " <td>1.0</td>\n",
1252
+ " <td>2.0</td>\n",
1253
+ " <td>House rules: Guests agree to the following ter...</td>\n",
1254
+ " <td>2</td>\n",
1255
+ " <td>Mosinee</td>\n",
1256
+ " </tr>\n",
1257
+ " <tr>\n",
1258
+ " <th>102596</th>\n",
1259
+ " <td>102596</td>\n",
1260
+ " <td>Comfy, bright room in Brooklyn</td>\n",
1261
+ " <td>Private room</td>\n",
1262
+ " <td>$988</td>\n",
1263
+ " <td>3.0</td>\n",
1264
+ " <td>5.0</td>\n",
1265
+ " <td>NaN</td>\n",
1266
+ " <td>2</td>\n",
1267
+ " <td>Amarillo</td>\n",
1268
+ " </tr>\n",
1269
+ " <tr>\n",
1270
+ " <th>102597</th>\n",
1271
+ " <td>102597</td>\n",
1272
+ " <td>Big Studio-One Stop from Midtown</td>\n",
1273
+ " <td>Entire home/apt</td>\n",
1274
+ " <td>$546</td>\n",
1275
+ " <td>2.0</td>\n",
1276
+ " <td>3.0</td>\n",
1277
+ " <td>NaN</td>\n",
1278
+ " <td>4</td>\n",
1279
+ " <td>Binghamton</td>\n",
1280
+ " </tr>\n",
1281
+ " <tr>\n",
1282
+ " <th>102598</th>\n",
1283
+ " <td>102598</td>\n",
1284
+ " <td>585 sf Luxury Studio</td>\n",
1285
+ " <td>Entire home/apt</td>\n",
1286
+ " <td>$1,032</td>\n",
1287
+ " <td>1.0</td>\n",
1288
+ " <td>3.0</td>\n",
1289
+ " <td>NaN</td>\n",
1290
+ " <td>7</td>\n",
1291
+ " <td>Flint</td>\n",
1292
+ " </tr>\n",
1293
+ " </tbody>\n",
1294
+ "</table>\n",
1295
+ "<p>102599 rows × 9 columns</p>\n",
1296
+ "</div>"
1297
+ ],
1298
+ "text/plain": [
1299
+ " Unnamed: 0 NAME \n",
1300
+ "0 0 Clean & quiet apt home by the park \\\n",
1301
+ "1 1 Skylit Midtown Castle \n",
1302
+ "2 2 THE VILLAGE OF HARLEM....NEW YORK ! \n",
1303
+ "3 3 NaN \n",
1304
+ "4 4 Entire Apt: Spacious Studio/Loft by central park \n",
1305
+ "... ... ... \n",
1306
+ "102594 102594 Spare room in Williamsburg \n",
1307
+ "102595 102595 Best Location near Columbia U \n",
1308
+ "102596 102596 Comfy, bright room in Brooklyn \n",
1309
+ "102597 102597 Big Studio-One Stop from Midtown \n",
1310
+ "102598 102598 585 sf Luxury Studio \n",
1311
+ "\n",
1312
+ " room type price minimum nights review rate number \n",
1313
+ "0 Private room $966 10.0 4.0 \\\n",
1314
+ "1 Entire home/apt $142 30.0 4.0 \n",
1315
+ "2 Private room $620 3.0 5.0 \n",
1316
+ "3 Entire home/apt $368 30.0 4.0 \n",
1317
+ "4 Entire home/apt $204 10.0 3.0 \n",
1318
+ "... ... ... ... ... \n",
1319
+ "102594 Private room $844 1.0 3.0 \n",
1320
+ "102595 Private room $837 1.0 2.0 \n",
1321
+ "102596 Private room $988 3.0 5.0 \n",
1322
+ "102597 Entire home/apt $546 2.0 3.0 \n",
1323
+ "102598 Entire home/apt $1,032 1.0 3.0 \n",
1324
+ "\n",
1325
+ " house_rules maximum occupancy \n",
1326
+ "0 Clean up and treat the home the way you'd like... 1 \\\n",
1327
+ "1 Pet friendly but please confirm with me if the... 2 \n",
1328
+ "2 I encourage you to use my kitchen, cooking and... 2 \n",
1329
+ "3 NaN 2 \n",
1330
+ "4 Please no smoking in the house, porch or on th... 3 \n",
1331
+ "... ... ... \n",
1332
+ "102594 No Smoking No Parties or Events of any kind Pl... 1 \n",
1333
+ "102595 House rules: Guests agree to the following ter... 2 \n",
1334
+ "102596 NaN 2 \n",
1335
+ "102597 NaN 4 \n",
1336
+ "102598 NaN 7 \n",
1337
+ "\n",
1338
+ " city \n",
1339
+ "0 Des Moines \n",
1340
+ "1 Wilmington \n",
1341
+ "2 St. George \n",
1342
+ "3 Kalamazoo \n",
1343
+ "4 Cheyenne \n",
1344
+ "... ... \n",
1345
+ "102594 White Plains \n",
1346
+ "102595 Mosinee \n",
1347
+ "102596 Amarillo \n",
1348
+ "102597 Binghamton \n",
1349
+ "102598 Flint \n",
1350
+ "\n",
1351
+ "[102599 rows x 9 columns]"
1352
+ ]
1353
+ },
1354
+ "execution_count": 52,
1355
+ "metadata": {},
1356
+ "output_type": "execute_result"
1357
+ }
1358
+ ],
1359
+ "source": [
1360
+ "data"
1361
+ ]
1362
+ },
1363
+ {
1364
+ "cell_type": "code",
1365
+ "execution_count": 63,
1366
+ "id": "bebb9c93",
1367
+ "metadata": {},
1368
+ "outputs": [],
1369
+ "source": [
1370
+ "filtered_data = data[data.iloc[:, -3].notna()]"
1371
+ ]
1372
+ },
1373
+ {
1374
+ "cell_type": "code",
1375
+ "execution_count": 64,
1376
+ "id": "bd010fc9",
1377
+ "metadata": {},
1378
+ "outputs": [],
1379
+ "source": [
1380
+ "dict_representation = filtered_data.to_dict(orient='split')"
1381
+ ]
1382
+ },
1383
+ {
1384
+ "cell_type": "code",
1385
+ "execution_count": 71,
1386
+ "id": "e84db5c4",
1387
+ "metadata": {},
1388
+ "outputs": [
1389
+ {
1390
+ "data": {
1391
+ "text/plain": [
1392
+ "50468"
1393
+ ]
1394
+ },
1395
+ "execution_count": 71,
1396
+ "metadata": {},
1397
+ "output_type": "execute_result"
1398
+ }
1399
+ ],
1400
+ "source": [
1401
+ "len(dict_representation['data'])"
1402
+ ]
1403
+ },
1404
+ {
1405
+ "cell_type": "code",
1406
+ "execution_count": 67,
1407
+ "id": "31eaadf3",
1408
+ "metadata": {},
1409
+ "outputs": [],
1410
+ "source": [
1411
+ "sample_df = filtered_data.sample(frac=0.1)"
1412
+ ]
1413
+ },
1414
+ {
1415
+ "cell_type": "code",
1416
+ "execution_count": 69,
1417
+ "id": "33998ec6",
1418
+ "metadata": {},
1419
+ "outputs": [],
1420
+ "source": [
1421
+ "sample_df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "cell_type": "code",
1426
+ "execution_count": 72,
1427
+ "id": "25396015",
1428
+ "metadata": {},
1429
+ "outputs": [
1430
+ {
1431
+ "data": {
1432
+ "text/plain": [
1433
+ "5047"
1434
+ ]
1435
+ },
1436
+ "execution_count": 72,
1437
+ "metadata": {},
1438
+ "output_type": "execute_result"
1439
+ }
1440
+ ],
1441
+ "source": [
1442
+ "len(sample_df)"
1443
+ ]
1444
+ },
1445
+ {
1446
+ "cell_type": "code",
1447
+ "execution_count": 3,
1448
+ "id": "17d054b5",
1449
+ "metadata": {},
1450
+ "outputs": [],
1451
+ "source": [
1452
+ "import pandas as pd\n",
1453
+ "data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
1454
+ ]
1455
+ },
1456
+ {
1457
+ "cell_type": "code",
1458
+ "execution_count": 4,
1459
+ "id": "64db8d6c",
1460
+ "metadata": {},
1461
+ "outputs": [],
1462
+ "source": [
1463
+ "data_dict = data.to_dict(orient = 'split')"
1464
+ ]
1465
+ },
1466
+ {
1467
+ "cell_type": "code",
1468
+ "execution_count": 21,
1469
+ "id": "b32b2f0c",
1470
+ "metadata": {},
1471
+ "outputs": [
1472
+ {
1473
+ "name": "stdout",
1474
+ "output_type": "stream",
1475
+ "text": [
1476
+ "0 Unnamed: 0.1\n",
1477
+ "1 Unnamed: 0\n",
1478
+ "2 NAME\n",
1479
+ "3 room type\n",
1480
+ "4 price\n",
1481
+ "5 minimum nights\n",
1482
+ "6 review rate number\n",
1483
+ "7 house_rules\n",
1484
+ "8 maximum occupancy\n",
1485
+ "9 city\n"
1486
+ ]
1487
+ }
1488
+ ],
1489
+ "source": [
1490
+ "for idx, unit in enumerate(data_dict['columns']):\n",
1491
+ " print(idx,unit)"
1492
+ ]
1493
+ },
1494
+ {
1495
+ "cell_type": "code",
1496
+ "execution_count": 8,
1497
+ "id": "fe415c1c",
1498
+ "metadata": {},
1499
+ "outputs": [
1500
+ {
1501
+ "data": {
1502
+ "text/plain": [
1503
+ "[0,\n",
1504
+ " 'Beautiful room upper manhttn.',\n",
1505
+ " 'Private room',\n",
1506
+ " 131.0,\n",
1507
+ " 1.0,\n",
1508
+ " 2.0,\n",
1509
+ " 'No smoking. No pets. ',\n",
1510
+ " 1,\n",
1511
+ " 'Christiansted']"
1512
+ ]
1513
+ },
1514
+ "execution_count": 8,
1515
+ "metadata": {},
1516
+ "output_type": "execute_result"
1517
+ }
1518
+ ],
1519
+ "source": [
1520
+ "data_dict['data'][0]"
1521
+ ]
1522
+ },
1523
+ {
1524
+ "cell_type": "code",
1525
+ "execution_count": 40,
1526
+ "id": "38cb5c5a",
1527
+ "metadata": {},
1528
+ "outputs": [],
1529
+ "source": [
1530
+ "import random\n",
1531
+ "new_data = []\n",
1532
+ "for idx, unit in enumerate(data_dict['data']):\n",
1533
+ " tmp_dict = {k:j for k,j in zip(['NAME','room type', 'price','minimum nights','review rate number','house_rules','maximum occupancy','city'],unit[1:])}\n",
1534
+ " if type(unit[4]) == str:\n",
1535
+ " tmp_dict[\"price\"] = eval(unit[4].replace(\"$\",\"\").replace(\",\",\"\"))\n",
1536
+ " house_rules_number = random.choice([0,1,1,1,2,2,3])\n",
1537
+ " tmp_dict['house_rules'] = \" & \".join(x for x in random.sample([\"No parties\",\"No smoking\",\"No children under 10\",\"No pets\",\"No visitors\"],house_rules_number))\n",
1538
+ " tmp_dict['city'] = tmp_dict['city'].split('/')[0]\n",
1539
+ " new_data.append(tmp_dict)"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "cell_type": "code",
1544
+ "execution_count": 41,
1545
+ "id": "ae3d551e",
1546
+ "metadata": {},
1547
+ "outputs": [
1548
+ {
1549
+ "data": {
1550
+ "text/plain": [
1551
+ "{'NAME': 'BIG room with bath & balcony in BK!',\n",
1552
+ " 'room type': 'Private room',\n",
1553
+ " 'price': 1123.0,\n",
1554
+ " 'minimum nights': 1.0,\n",
1555
+ " 'review rate number': 4.0,\n",
1556
+ " 'house_rules': 'No parties',\n",
1557
+ " 'maximum occupancy': 2,\n",
1558
+ " 'city': 'Louisville'}"
1559
+ ]
1560
+ },
1561
+ "execution_count": 41,
1562
+ "metadata": {},
1563
+ "output_type": "execute_result"
1564
+ }
1565
+ ],
1566
+ "source": [
1567
+ "new_data[2]"
1568
+ ]
1569
+ },
1570
+ {
1571
+ "cell_type": "code",
1572
+ "execution_count": 42,
1573
+ "id": "6fac856c",
1574
+ "metadata": {},
1575
+ "outputs": [
1576
+ {
1577
+ "name": "stdout",
1578
+ "output_type": "stream",
1579
+ "text": [
1580
+ "\n",
1581
+ "----------\n",
1582
+ "No pets & No visitors & No smoking\n",
1583
+ "----------\n",
1584
+ "No parties & No visitors\n",
1585
+ "----------\n",
1586
+ "No children under 10 & No pets & No smoking\n",
1587
+ "----------\n",
1588
+ "No parties & No pets & No visitors\n",
1589
+ "----------\n",
1590
+ "No pets & No children under 10\n",
1591
+ "----------\n",
1592
+ "No children under 10 & No parties & No pets\n",
1593
+ "----------\n",
1594
+ "No visitors\n",
1595
+ "----------\n",
1596
+ "No parties & No children under 10\n",
1597
+ "----------\n",
1598
+ "No children under 10 & No smoking & No visitors\n",
1599
+ "----------\n",
1600
+ "No children under 10 & No parties & No smoking\n",
1601
+ "----------\n",
1602
+ "No pets & No smoking & No children under 10\n",
1603
+ "----------\n",
1604
+ "No pets & No visitors\n",
1605
+ "----------\n",
1606
+ "No visitors & No pets\n",
1607
+ "----------\n",
1608
+ "No children under 10 & No smoking & No pets\n",
1609
+ "----------\n",
1610
+ "No smoking & No parties & No pets\n",
1611
+ "----------\n",
1612
+ "No visitors & No children under 10 & No parties\n",
1613
+ "----------\n",
1614
+ "No parties & No children under 10 & No smoking\n",
1615
+ "----------\n",
1616
+ "No visitors & No children under 10 & No smoking\n",
1617
+ "----------\n",
1618
+ "No pets & No parties\n",
1619
+ "----------\n",
1620
+ "No smoking & No parties\n",
1621
+ "----------\n",
1622
+ "No smoking & No children under 10\n",
1623
+ "----------\n",
1624
+ "No parties & No children under 10 & No visitors\n",
1625
+ "----------\n",
1626
+ "No children under 10 & No smoking\n",
1627
+ "----------\n",
1628
+ "No visitors & No pets & No smoking\n",
1629
+ "----------\n",
1630
+ "No pets\n",
1631
+ "----------\n",
1632
+ "No children under 10 & No pets\n",
1633
+ "----------\n",
1634
+ "No visitors & No smoking\n",
1635
+ "----------\n",
1636
+ "No smoking\n",
1637
+ "----------\n",
1638
+ "No parties & No smoking & No children under 10\n",
1639
+ "----------\n",
1640
+ "No parties & No smoking\n",
1641
+ "----------\n",
1642
+ "No smoking & No visitors & No parties\n",
1643
+ "----------\n",
1644
+ "No pets & No smoking\n",
1645
+ "----------\n",
1646
+ "No pets & No smoking & No parties\n",
1647
+ "----------\n",
1648
+ "No smoking & No children under 10 & No visitors\n",
1649
+ "----------\n",
1650
+ "No parties & No smoking & No visitors\n",
1651
+ "----------\n",
1652
+ "No visitors & No parties\n",
1653
+ "----------\n",
1654
+ "No visitors & No children under 10\n",
1655
+ "----------\n",
1656
+ "No parties & No smoking & No pets\n",
1657
+ "----------\n",
1658
+ "No children under 10 & No pets & No visitors\n",
1659
+ "----------\n",
1660
+ "No smoking & No pets & No parties\n",
1661
+ "----------\n",
1662
+ "No children under 10 & No smoking & No parties\n",
1663
+ "----------\n",
1664
+ "No visitors & No children under 10 & No pets\n",
1665
+ "----------\n",
1666
+ "No children under 10 & No parties\n",
1667
+ "----------\n",
1668
+ "No pets & No parties & No visitors\n",
1669
+ "----------\n",
1670
+ "No children under 10 & No visitors & No parties\n",
1671
+ "----------\n",
1672
+ "No parties & No pets\n",
1673
+ "----------\n",
1674
+ "No visitors & No parties & No pets\n",
1675
+ "----------\n",
1676
+ "No smoking & No pets & No visitors\n",
1677
+ "----------\n",
1678
+ "No smoking & No pets\n",
1679
+ "----------\n",
1680
+ "No visitors & No smoking & No children under 10\n",
1681
+ "----------\n",
1682
+ "No pets & No children under 10 & No parties\n",
1683
+ "----------\n",
1684
+ "No visitors & No pets & No children under 10\n",
1685
+ "----------\n",
1686
+ "No pets & No children under 10 & No smoking\n",
1687
+ "----------\n",
1688
+ "No parties & No visitors & No children under 10\n",
1689
+ "----------\n",
1690
+ "No pets & No smoking & No visitors\n",
1691
+ "----------\n",
1692
+ "No pets & No parties & No smoking\n",
1693
+ "----------\n",
1694
+ "No parties & No visitors & No smoking\n",
1695
+ "----------\n",
1696
+ "No pets & No visitors & No children under 10\n",
1697
+ "----------\n",
1698
+ "No parties & No visitors & No pets\n",
1699
+ "----------\n",
1700
+ "No children under 10\n",
1701
+ "----------\n",
1702
+ "No children under 10 & No pets & No parties\n",
1703
+ "----------\n",
1704
+ "No children under 10 & No visitors & No smoking\n",
1705
+ "----------\n",
1706
+ "No smoking & No children under 10 & No parties\n",
1707
+ "----------\n",
1708
+ "No pets & No parties & No children under 10\n",
1709
+ "----------\n",
1710
+ "No children under 10 & No visitors & No pets\n",
1711
+ "----------\n",
1712
+ "No parties & No pets & No smoking\n",
1713
+ "----------\n",
1714
+ "No pets & No children under 10 & No visitors\n",
1715
+ "----------\n",
1716
+ "No parties & No children under 10 & No pets\n",
1717
+ "----------\n",
1718
+ "No parties & No pets & No children under 10\n",
1719
+ "----------\n",
1720
+ "No smoking & No parties & No visitors\n",
1721
+ "----------\n",
1722
+ "No parties\n",
1723
+ "----------\n",
1724
+ "No visitors & No pets & No parties\n",
1725
+ "----------\n",
1726
+ "No children under 10 & No visitors\n",
1727
+ "----------\n",
1728
+ "No smoking & No children under 10 & No pets\n",
1729
+ "----------\n",
1730
+ "No smoking & No parties & No children under 10\n",
1731
+ "----------\n",
1732
+ "No visitors & No smoking & No parties\n",
1733
+ "----------\n",
1734
+ "No pets & No visitors & No parties\n",
1735
+ "----------\n",
1736
+ "No smoking & No visitors\n",
1737
+ "----------\n",
1738
+ "No smoking & No visitors & No children under 10\n",
1739
+ "----------\n",
1740
+ "No visitors & No smoking & No pets\n",
1741
+ "----------\n",
1742
+ "No smoking & No visitors & No pets\n",
1743
+ "----------\n",
1744
+ "No visitors & No parties & No smoking\n",
1745
+ "----------\n",
1746
+ "No smoking & No pets & No children under 10\n",
1747
+ "----------\n",
1748
+ "No children under 10 & No parties & No visitors\n",
1749
+ "----------\n",
1750
+ "No visitors & No parties & No children under 10\n",
1751
+ "----------\n"
1752
+ ]
1753
+ }
1754
+ ],
1755
+ "source": [
1756
+ "maximum_occupancy_set = set()\n",
1757
+ "for unit in new_data:\n",
1758
+ " maximum_occupancy_set.add(unit['house_rules'])\n",
1759
+ "for unit in maximum_occupancy_set:\n",
1760
+ " print(unit)\n",
1761
+ " print(\"----------\")"
1762
+ ]
1763
+ },
1764
+ {
1765
+ "cell_type": "code",
1766
+ "execution_count": 45,
1767
+ "id": "8056052a",
1768
+ "metadata": {},
1769
+ "outputs": [
1770
+ {
1771
+ "data": {
1772
+ "text/html": [
1773
+ "<div>\n",
1774
+ "<style scoped>\n",
1775
+ " .dataframe tbody tr th:only-of-type {\n",
1776
+ " vertical-align: middle;\n",
1777
+ " }\n",
1778
+ "\n",
1779
+ " .dataframe tbody tr th {\n",
1780
+ " vertical-align: top;\n",
1781
+ " }\n",
1782
+ "\n",
1783
+ " .dataframe thead th {\n",
1784
+ " text-align: right;\n",
1785
+ " }\n",
1786
+ "</style>\n",
1787
+ "<table border=\"1\" class=\"dataframe\">\n",
1788
+ " <thead>\n",
1789
+ " <tr style=\"text-align: right;\">\n",
1790
+ " <th></th>\n",
1791
+ " <th>NAME</th>\n",
1792
+ " <th>room type</th>\n",
1793
+ " <th>price</th>\n",
1794
+ " <th>minimum nights</th>\n",
1795
+ " <th>review rate number</th>\n",
1796
+ " <th>house_rules</th>\n",
1797
+ " <th>maximum occupancy</th>\n",
1798
+ " <th>city</th>\n",
1799
+ " </tr>\n",
1800
+ " </thead>\n",
1801
+ " <tbody>\n",
1802
+ " <tr>\n",
1803
+ " <th>0</th>\n",
1804
+ " <td>Beautiful room upper manhttn.</td>\n",
1805
+ " <td>Private room</td>\n",
1806
+ " <td>131.0</td>\n",
1807
+ " <td>1.0</td>\n",
1808
+ " <td>2.0</td>\n",
1809
+ " <td>No smoking</td>\n",
1810
+ " <td>1</td>\n",
1811
+ " <td>Christiansted</td>\n",
1812
+ " </tr>\n",
1813
+ " <tr>\n",
1814
+ " <th>1</th>\n",
1815
+ " <td>Roomy and Comftable Room</td>\n",
1816
+ " <td>Private room</td>\n",
1817
+ " <td>548.0</td>\n",
1818
+ " <td>10.0</td>\n",
1819
+ " <td>5.0</td>\n",
1820
+ " <td>No children under 10 &amp; No parties</td>\n",
1821
+ " <td>2</td>\n",
1822
+ " <td>Laredo</td>\n",
1823
+ " </tr>\n",
1824
+ " <tr>\n",
1825
+ " <th>2</th>\n",
1826
+ " <td>BIG room with bath &amp; balcony in BK!</td>\n",
1827
+ " <td>Private room</td>\n",
1828
+ " <td>1123.0</td>\n",
1829
+ " <td>1.0</td>\n",
1830
+ " <td>4.0</td>\n",
1831
+ " <td>No parties</td>\n",
1832
+ " <td>2</td>\n",
1833
+ " <td>Louisville</td>\n",
1834
+ " </tr>\n",
1835
+ " <tr>\n",
1836
+ " <th>3</th>\n",
1837
+ " <td>4A-</td>\n",
1838
+ " <td>Entire home/apt</td>\n",
1839
+ " <td>225.0</td>\n",
1840
+ " <td>30.0</td>\n",
1841
+ " <td>4.0</td>\n",
1842
+ " <td>No pets</td>\n",
1843
+ " <td>3</td>\n",
1844
+ " <td>Greensboro</td>\n",
1845
+ " </tr>\n",
1846
+ " <tr>\n",
1847
+ " <th>4</th>\n",
1848
+ " <td>Nice and Comfortable Private Room</td>\n",
1849
+ " <td>Private room</td>\n",
1850
+ " <td>761.0</td>\n",
1851
+ " <td>2.0</td>\n",
1852
+ " <td>1.0</td>\n",
1853
+ " <td>No smoking &amp; No parties</td>\n",
1854
+ " <td>2</td>\n",
1855
+ " <td>Cape Girardeau</td>\n",
1856
+ " </tr>\n",
1857
+ " <tr>\n",
1858
+ " <th>...</th>\n",
1859
+ " <td>...</td>\n",
1860
+ " <td>...</td>\n",
1861
+ " <td>...</td>\n",
1862
+ " <td>...</td>\n",
1863
+ " <td>...</td>\n",
1864
+ " <td>...</td>\n",
1865
+ " <td>...</td>\n",
1866
+ " <td>...</td>\n",
1867
+ " </tr>\n",
1868
+ " <tr>\n",
1869
+ " <th>5042</th>\n",
1870
+ " <td>Amazing LOFT in Prime Williamsburg</td>\n",
1871
+ " <td>Private room</td>\n",
1872
+ " <td>249.0</td>\n",
1873
+ " <td>5.0</td>\n",
1874
+ " <td>5.0</td>\n",
1875
+ " <td>No pets</td>\n",
1876
+ " <td>2</td>\n",
1877
+ " <td>Trenton</td>\n",
1878
+ " </tr>\n",
1879
+ " <tr>\n",
1880
+ " <th>5043</th>\n",
1881
+ " <td>Private Queen Bedroom in Brooklyn</td>\n",
1882
+ " <td>Private room</td>\n",
1883
+ " <td>1032.0</td>\n",
1884
+ " <td>1.0</td>\n",
1885
+ " <td>1.0</td>\n",
1886
+ " <td>No pets</td>\n",
1887
+ " <td>1</td>\n",
1888
+ " <td>Des Moines</td>\n",
1889
+ " </tr>\n",
1890
+ " <tr>\n",
1891
+ " <th>5044</th>\n",
1892
+ " <td>Bushwick / Bed Sty Retreat</td>\n",
1893
+ " <td>Private room</td>\n",
1894
+ " <td>546.0</td>\n",
1895
+ " <td>2.0</td>\n",
1896
+ " <td>4.0</td>\n",
1897
+ " <td>No children under 10 &amp; No visitors &amp; No smoking</td>\n",
1898
+ " <td>2</td>\n",
1899
+ " <td>Scottsbluff</td>\n",
1900
+ " </tr>\n",
1901
+ " <tr>\n",
1902
+ " <th>5045</th>\n",
1903
+ " <td>Charming Mid-Century Studio</td>\n",
1904
+ " <td>Entire home/apt</td>\n",
1905
+ " <td>1115.0</td>\n",
1906
+ " <td>2.0</td>\n",
1907
+ " <td>5.0</td>\n",
1908
+ " <td>No pets &amp; No children under 10</td>\n",
1909
+ " <td>7</td>\n",
1910
+ " <td>Butte</td>\n",
1911
+ " </tr>\n",
1912
+ " <tr>\n",
1913
+ " <th>5046</th>\n",
1914
+ " <td>3 Bed/ 2 Bath Full Apt. BK Heights</td>\n",
1915
+ " <td>Entire home/apt</td>\n",
1916
+ " <td>396.0</td>\n",
1917
+ " <td>2.0</td>\n",
1918
+ " <td>1.0</td>\n",
1919
+ " <td>No smoking</td>\n",
1920
+ " <td>3</td>\n",
1921
+ " <td>Norfolk</td>\n",
1922
+ " </tr>\n",
1923
+ " </tbody>\n",
1924
+ "</table>\n",
1925
+ "<p>5047 rows × 8 columns</p>\n",
1926
+ "</div>"
1927
+ ],
1928
+ "text/plain": [
1929
+ " NAME room type price \n",
1930
+ "0 Beautiful room upper manhttn. Private room 131.0 \\\n",
1931
+ "1 Roomy and Comftable Room Private room 548.0 \n",
1932
+ "2 BIG room with bath & balcony in BK! Private room 1123.0 \n",
1933
+ "3 4A- Entire home/apt 225.0 \n",
1934
+ "4 Nice and Comfortable Private Room Private room 761.0 \n",
1935
+ "... ... ... ... \n",
1936
+ "5042 Amazing LOFT in Prime Williamsburg Private room 249.0 \n",
1937
+ "5043 Private Queen Bedroom in Brooklyn Private room 1032.0 \n",
1938
+ "5044 Bushwick / Bed Sty Retreat Private room 546.0 \n",
1939
+ "5045 Charming Mid-Century Studio Entire home/apt 1115.0 \n",
1940
+ "5046 3 Bed/ 2 Bath Full Apt. BK Heights Entire home/apt 396.0 \n",
1941
+ "\n",
1942
+ " minimum nights review rate number \n",
1943
+ "0 1.0 2.0 \\\n",
1944
+ "1 10.0 5.0 \n",
1945
+ "2 1.0 4.0 \n",
1946
+ "3 30.0 4.0 \n",
1947
+ "4 2.0 1.0 \n",
1948
+ "... ... ... \n",
1949
+ "5042 5.0 5.0 \n",
1950
+ "5043 1.0 1.0 \n",
1951
+ "5044 2.0 4.0 \n",
1952
+ "5045 2.0 5.0 \n",
1953
+ "5046 2.0 1.0 \n",
1954
+ "\n",
1955
+ " house_rules maximum occupancy \n",
1956
+ "0 No smoking 1 \\\n",
1957
+ "1 No children under 10 & No parties 2 \n",
1958
+ "2 No parties 2 \n",
1959
+ "3 No pets 3 \n",
1960
+ "4 No smoking & No parties 2 \n",
1961
+ "... ... ... \n",
1962
+ "5042 No pets 2 \n",
1963
+ "5043 No pets 1 \n",
1964
+ "5044 No children under 10 & No visitors & No smoking 2 \n",
1965
+ "5045 No pets & No children under 10 7 \n",
1966
+ "5046 No smoking 3 \n",
1967
+ "\n",
1968
+ " city \n",
1969
+ "0 Christiansted \n",
1970
+ "1 Laredo \n",
1971
+ "2 Louisville \n",
1972
+ "3 Greensboro \n",
1973
+ "4 Cape Girardeau \n",
1974
+ "... ... \n",
1975
+ "5042 Trenton \n",
1976
+ "5043 Des Moines \n",
1977
+ "5044 Scottsbluff \n",
1978
+ "5045 Butte \n",
1979
+ "5046 Norfolk \n",
1980
+ "\n",
1981
+ "[5047 rows x 8 columns]"
1982
+ ]
1983
+ },
1984
+ "execution_count": 45,
1985
+ "metadata": {},
1986
+ "output_type": "execute_result"
1987
+ }
1988
+ ],
1989
+ "source": [
1990
+ "df"
1991
+ ]
1992
+ },
1993
+ {
1994
+ "cell_type": "code",
1995
+ "execution_count": 44,
1996
+ "id": "54423e0d",
1997
+ "metadata": {},
1998
+ "outputs": [],
1999
+ "source": [
2000
+ "df = pd.DataFrame(new_data)\n",
2001
+ "df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
2002
+ ]
2003
+ },
2004
+ {
2005
+ "cell_type": "code",
2006
+ "execution_count": null,
2007
+ "id": "5767aa80",
2008
+ "metadata": {},
2009
+ "outputs": [],
2010
+ "source": [
2011
+ "df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'}, inplace=True)\n",
2012
+ "df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
2013
+ ]
2014
+ }
2015
+ ],
2016
+ "metadata": {
2017
+ "kernelspec": {
2018
+ "display_name": "Python 3 (ipykernel)",
2019
+ "language": "python",
2020
+ "name": "python3"
2021
+ },
2022
+ "language_info": {
2023
+ "codemirror_mode": {
2024
+ "name": "ipython",
2025
+ "version": 3
2026
+ },
2027
+ "file_extension": ".py",
2028
+ "mimetype": "text/x-python",
2029
+ "name": "python",
2030
+ "nbconvert_exporter": "python",
2031
+ "pygments_lexer": "ipython3",
2032
+ "version": "3.9.16"
2033
+ }
2034
+ },
2035
+ "nbformat": 4,
2036
+ "nbformat_minor": 5
2037
+ }
tools/accommodations/test.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.accommodations.apis import Hotels
2
+ import pandas as pd
3
+
4
+ # Show all columns (up to 100) when printing a DataFrame
5
+ pd.set_option('display.max_columns', 100)
6
+
7
+ # Show all rows (up to 100) when printing a DataFrame
8
+ pd.set_option('display.max_rows', 100)
9
+
10
+ hotel = Hotels('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')
11
+ data = hotel.run('New York')
12
+ print(data)
tools/attractions/__pycache__/apis.cpython-39.pyc ADDED
Binary file (1.55 kB). View file
 
tools/attractions/apis.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pandas import DataFrame
3
+ from typing import Optional
4
+ from annotation.src.utils import extract_before_parenthesis
5
+
6
+
7
class Attractions:
    """Look up attractions by city from a CSV table.

    The table is read with pandas; rows containing any missing value are
    dropped and only the columns listed in ``_COLUMNS`` are kept.
    """

    # Columns kept from the CSV, in the order they are presented to callers.
    _COLUMNS = ['Name', 'Latitude', 'Longitude', 'Address', 'Phone', 'Website', "City"]

    def __init__(self, path="../database/attractions/attractions.csv"):
        """Load the attractions table.

        Args:
            path: Location of the attractions CSV file.
        """
        self.path = path
        self.load_db()
        print("Attractions loaded.")

    def load_db(self):
        """(Re)load ``self.data`` from ``self.path``.

        The same cleaning is applied as at construction time, so a reload
        never changes the shape of the table seen by ``run``.
        """
        # dropna() runs before the column projection on purpose: a row is
        # discarded if *any* CSV column is missing, not just the kept ones.
        self.data = pd.read_csv(self.path).dropna()[self._COLUMNS]

    def run(self,
            city: str,
            ) -> DataFrame:
        """Search for attractions located in ``city``.

        Args:
            city: City name, compared for exact equality with the ``City``
                column.

        Returns:
            A DataFrame of matching rows re-indexed from 0. When no row
            matches, the plain string
            ``"There is no attraction in this city."`` is returned instead
            of an empty frame.
        """
        # Re-index so callers see row numbers starting at 0.
        results = self.data[self.data["City"] == city].reset_index(drop=True)
        if results.empty:
            return "There is no attraction in this city."
        return results

    def run_for_annotation(self,
                           city: str,
                           ) -> DataFrame:
        """Search for attractions for the annotation tooling.

        ``city`` is first passed through ``extract_before_parenthesis``
        (presumably dropping a trailing "(...)" suffix -- confirm in
        ``annotation.src.utils``). Unlike ``run``, an empty DataFrame is
        returned when nothing matches.

        Args:
            city: City label as supplied by the annotation tooling.

        Returns:
            A DataFrame of matching rows re-indexed from 0 (possibly empty).
        """
        results = self.data[self.data["City"] == extract_before_parenthesis(city)]
        return results.reset_index(drop=True)
tools/attractions/test.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tools.attractions.apis import Attractions
2
+ import pandas as pd
3
+ import sys
4
+ import os
5
+ sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
6
+ os.chdir(os.path.dirname(os.path.abspath(__file__)))
7
+ unique_cities = Attractions(path="../../database/attractions/attractions.csv").data['City'].unique()
8
+ df = Attractions(path="../../database/attractions/attractions.csv").data
9
+ print(len(df))
10
+ citySet = open('../../database/background/citySet.txt','r').read().split('\n')
11
+ cnt = 0
12
+ for city in unique_cities:
13
+ if city not in citySet:
14
+ df = df[df['City'] != city]
15
+ print(len(df))
16
+
17
+ df.to_csv('../../database/attractions/attractions2.csv', index=False)
tools/cities/__pycache__/apis.cpython-39.pyc ADDED
Binary file (1.1 kB). View file
 
tools/cities/apis.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pandas import DataFrame
2
+
3
class Cities:
    """Map each state to the list of cities recorded for it.

    The backing text file holds one ``city<TAB>state`` pair per line.
    """

    def __init__(self ,path="../database/background/citySet_with_states.txt") -> None:
        """Read the city/state mapping.

        Args:
            path: Location of the tab-separated city/state file.
        """
        self.path = path
        self.load_data()
        print("Cities loaded.")

    def load_data(self):
        """(Re)build ``self.data`` as ``{state: [city, ...]}`` from ``self.path``.

        Cities keep the order in which they appear in the file.
        """
        # The context manager closes the file handle even if reading fails.
        with open(self.path, "r") as handle:
            lines = handle.read().strip().split("\n")

        self.data = {}
        for unit in lines:
            # An empty file (or stray blank line) yields "" here; skip it
            # rather than failing on the tuple unpacking below.
            if not unit:
                continue
            city, state = unit.split("\t")
            self.data.setdefault(state, []).append(city)

    def run(self, state) -> list:
        """Return the cities recorded for ``state``.

        Args:
            state: State name exactly as written in the mapping file.

        Returns:
            The list of city names for ``state``. For an unknown state a
            ``ValueError("Invalid State")`` instance is *returned*, not
            raised.
        """
        if state not in self.data:
            # NOTE(review): this returns the exception object instead of
            # raising it. Kept as-is because existing callers may rely on
            # receiving a value; confirm whether `raise` was intended.
            return ValueError("Invalid State")
        return self.data[state]
tools/cities/test.py ADDED
File without changes