AliMustapha committed on
Commit
43d233e
·
1 Parent(s): 1ffb898
Files changed (1) hide show
  1. dt.py +0 -386
dt.py DELETED
@@ -1,386 +0,0 @@
1
- # from Dictionary_guesser.name_nation_guesser import NameNationGuesser
2
- # import datetime
3
- # from GitScraping import CommitInfo
4
- # if __name__ == "__main__":
5
-
6
- # guesser =NameNationGuesser(names_filename="Dictionary_guesser/names.csv",places_filename='Dictionary_guesser/places.tab', guess_first_second_min_mag=None,place_column_name="sub-region")
7
- # commit_info=CommitInfo("https://github.com/AhmadM-DL/On-Learning-Implicit-Protected-Attributes")
8
- # df,first_commit_dates = commit_info.get_first_commit_dates()
9
-
10
- # def guess_zone(name, epoch, offset):
11
- # dt = datetime.datetime.fromtimestamp(epoch)
12
- # country_pop_map = guesser.country_pop_from_datetime(dt, offset)
13
- # # print(country_pop_map)
14
- # return guesser.guess_zone(name, country_pop_map=country_pop_map)
15
- # first_commit_dates['Commit_Seconds'] = first_commit_dates['First_Commit_Date'].apply(lambda x: x.timestamp())
16
- # first_commit_dates['Author_Timezone'] = first_commit_dates['Author_Timezone'] /60
17
-
18
-
19
- # first_commit_dates['region_Dictionary'] = first_commit_dates.apply(lambda row: guess_zone(row['Author'],row['Commit_Seconds'], row['Author_Timezone']), axis=1)
20
- # print(first_commit_dates)
21
-
22
- from google.cloud import storage
23
- import json
24
- import os
25
- import pandas as pd
26
- # # Initialize a client
27
- # jsonApi = os.getenv('apiKey')
28
- # print(jsonApi)
29
-
30
- # export jsonApi='{
31
- # "type": "service_account",
32
- # "project_id": "kinetic-guild-369323",
33
- # "private_key_id": "b06a3ad76990da0e6970c072e95e7d26bb2e8c1d",
34
- # "private_key": "<REDACTED: a real service-account private key was committed here; revoke and rotate this credential and load it from a secret store instead>",
35
- # "client_email": "[email protected]",
36
- # "client_id": "102736498211031284416",
37
- # "auth_uri": "https://accounts.google.com/o/oauth2/auth",
38
- # "token_uri": "https://oauth2.googleapis.com/token",
39
- # "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
40
- # "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/geogenderali%40kinetic-guild-369323.iam.gserviceaccount.com",
41
- # "universe_domain": "googleapis.com"
42
- # }'
43
- #
44
- # os.environ['apiKey'] = json_string
45
- #
46
-
47
# Load a tab-separated file from a Google Cloud Storage bucket into a DataFrame.
#
# Configuration is read from three environment variables:
#   jsonApi      service-account credentials as a JSON string
#   bucket_name  name of the bucket holding the file
#   file_name    object name of the tab-separated file inside the bucket
jsonApi = os.getenv('jsonApi')
bucket_name = os.getenv('bucket_name')
file_name = os.getenv('file_name')

# Fail early with a readable message instead of an opaque TypeError from
# json.loads(None) or an error deep inside the storage client.
_missing_settings = [
    setting
    for setting, value in (
        ('jsonApi', jsonApi),
        ('bucket_name', bucket_name),
        ('file_name', file_name),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(_missing_settings)
    )

print(file_name)
print(bucket_name)
# jsonApi is deliberately NOT printed: it contains the service-account private
# key and must never end up in logs or terminal scrollback.

service_account_info = json.loads(jsonApi)
client = storage.Client.from_service_account_info(service_account_info)

blob = client.get_bucket(bucket_name).blob(file_name)

# Stream the object straight into pandas; the file is tab-separated.
with blob.open("r") as file:
    df = pd.read_csv(file, sep="\t")

# df now contains the data from the remote file.
print(df.head())
64
-
65
-
66
-
67
-
68
-
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
-
80
-
81
-
82
-
83
-
84
-
85
- #!/usr/bin/env python3
86
-
87
- __copyright__ = "Copyright (C) 2022 Davide Rossi"
88
- __license__ = "GPL-3.0-or-later"
89
-
90
- import pandas as pd
91
- import numpy as np
92
- import re
93
- import random
94
- import pytz
95
- import datetime
96
- import time
97
- import regex
98
- import click
99
- import code
100
- import logging
101
- from enum import Enum
102
- import csv
103
- from unidecode import unidecode
104
- from collections import defaultdict
105
-
106
-
107
class Algorithms(str, Enum):
    """Strategies for combining the scores of the parts of a name."""

    AVG = 'avg'    # arithmetic mean of the per-part scores
    PROD = 'prod'  # product of the per-part frequencies, rescaled by population


class NameNationGuesser:
    """Guess the place (and zone) a person's name most likely comes from.

    Two tab-separated tables drive the guess:

    * a *places* table with one row per place/timezone, including its
      ``code``, ``timezone``, ``population`` and zone-like columns;
    * a *names* table with rows ``name, type, code, frequency, gender``.

    A name is split into parts, each part is scored per place code as
    ``population * frequency``, and the per-part scores are combined with
    the configured :class:`Algorithms` member.
    """

    WORD_RE = re.compile(r'\W+')
    CAMEL_RE = re.compile(r'([A-Z][a-z]+)')
    UPPER_RE = re.compile(r'([A-Z]+)')
    UNDER_RE = re.compile(r'(_)')
    LEADING_BLANKS_RE = re.compile(r'^\s*')
    DEFAULT_PLACES_FILENAME = 'places.tab'
    DEFAULT_NAMES_FILENAME = 'names_codes.tab'
    DEFAULT_GUESS_FIRST_SECOND_MIN_MAG = None
    DEFAULT_ALGORITHM = Algorithms.AVG
    DEFAULT_COLUMN_NAME = "zone"
    # Maps each separator character to a blank. Built from a dict because the
    # two-string form of str.maketrans requires both strings to have the same
    # length and raises ValueError otherwise.
    _SEPARATOR_TABLE = str.maketrans({char: " " for char in r"-._\/+"})

    def __init__(self, places_filename=DEFAULT_PLACES_FILENAME, names_filename=DEFAULT_NAMES_FILENAME, algorithm=DEFAULT_ALGORITHM, guess_first_second_min_mag=DEFAULT_GUESS_FIRST_SECOND_MIN_MAG, place_column_name=DEFAULT_COLUMN_NAME):
        """Load the places and names tables and initialise the caches.

        Args:
            places_filename: path of the tab-separated places table.
            names_filename: path of the tab-separated names table.
            algorithm: an :class:`Algorithms` member used by ``guess_scores``.
            guess_first_second_min_mag: when not None, the best score must be
                at least this many times the runner-up score for ``guess`` /
                ``guess_zone`` to return a result.
            place_column_name: column of the places table returned by
                ``get_zone_by_place`` (and therefore by ``guess_zone``).
        """
        # Per-code caches, filled lazily by the accessor methods.
        self.zone_by_place = {}
        self.pop_by_place = {}
        self.min_freq_by_place = {}
        self.cumsum_population_by_code = {}
        self.name_rows_by_name = defaultdict(list)
        self.all_timezones = None
        self.places_data = None
        self.names_data = None
        self.names_data_empty = None
        self.names_data_col_names = ['name', 'type', 'code', 'frequency', 'gender']
        self.names_data_dtype = {'frequency': float}
        self.names_data_by_name = None
        self.guess_first_second_min_mag = guess_first_second_min_mag
        self.algorithm = algorithm
        self.place_column_name = place_column_name
        # keep_default_na=False with na_values='' makes only empty cells
        # missing, so literal strings such as "NA" are kept as text.
        self.places_data = pd.read_csv(
            places_filename, sep='\t', header=0, keep_default_na=False, na_values='',
            names=['country', 'state_name', 'region', 'un_subregion', 'zone', 'timezone', 'population', 'sovereignty_numeric', 'sovereignty', 'code', 'code3', 'code_num', 'cctdl', "_", 'sub-region'],
            dtype={'population': int, 'sovereignty_numeric': int, 'code_num': int})

        self.names_data = self.__read_names_data(names_filename)
        self.names_data_empty = pd.DataFrame().reindex(columns=self.names_data.columns)

    @classmethod
    def advanced_splitter(cls, seq):
        """Split words separated by spaces or using CamelNotation"""
        return cls.WORD_RE.split(cls.LEADING_BLANKS_RE.sub(r'', cls.CAMEL_RE.sub(r' \1', cls.UPPER_RE.sub(r' \1', cls.UNDER_RE.sub(r' ', seq)))))

    def __get_cumsum_population(self, code):
        """Return ``(total_population, frame)`` for ``code``, cached per code.

        ``frame`` has the columns ``timezone`` and ``population``; rows with
        zero population are dropped, the rest are sorted by population and
        ``population`` is replaced by its running total.
        """
        if code not in self.cumsum_population_by_code:
            df = self.places_data
            df_code = df[(df['code'] == code) & (df['population'] != 0)][['timezone', 'population']]
            population = df_code['population'].sum()
            df_code = df_code.sort_values('population')
            df_code['population'] = df_code['population'].cumsum()
            self.cumsum_population_by_code[code] = (population, df_code.copy().reset_index(drop=True))
        return self.cumsum_population_by_code[code]

    def _sample_timezone_name(self, code):
        """Draw a timezone of ``code`` with probability proportional to population.

        Raises:
            ValueError: if ``code`` has no row with a positive population.
        """
        population, df_code = self.__get_cumsum_population(code)
        if population <= 0:
            raise ValueError(f'no populated timezone known for place code {code!r}')
        draw = random.randrange(int(population))
        # Strictly greater: a bucket whose running total ends at c owns the
        # draws below c. Using >= would hand one extra draw to every bucket
        # but the last and break proportionality.
        return df_code[df_code['population'] > draw].iloc[0]['timezone']

    def compatible_datetime_offset(self, code):
        """Return a random ``(datetime, utc_offset_minutes)`` plausible for ``code``.

        The timezone is drawn proportionally to population; the datetime is a
        random instant between the epoch and now, converted with
        ``datetime.fromtimestamp`` (naive, local time of the running machine).

        Raises:
            ValueError: if ``code`` has no row with a positive population.
        """
        timezone = pytz.timezone(self._sample_timezone_name(code))
        while True:
            epoch = random.uniform(0, time.time())
            dt = datetime.datetime.fromtimestamp(epoch)
            try:
                offset = int(timezone.utcoffset(dt).total_seconds() / 60)
            except pytz.exceptions.InvalidTimeError:
                # Ambiguous or non-existent wall-clock time in this timezone
                # (DST transition): draw another instant.
                continue
            return dt, offset

    def is_roman_language(self, text):
        """Return True when ``text`` consists only of Latin-script characters."""
        roman_pattern = r'^\p{Latin}+$'
        match = regex.match(roman_pattern, text, flags=regex.UNICODE)
        return match is not None

    def text_to_romanize(self, text):
        """Replace the separators ``- . _ \\ / +`` by blanks and romanize ``text``.

        The text is transliterated with ``unidecode`` unless
        ``is_roman_language`` accepts it as is.
        """
        text = str(text).translate(self._SEPARATOR_TABLE)
        if not self.is_roman_language(text):
            return unidecode(text)
        return text

    def __read_names_data(self, names_filename):
        """Read the names table into a DataFrame and index its rows by name.

        The first line of the file is treated as a header and skipped. Names
        are lower-cased. Besides returning the full frame, this resets
        ``self.names_data_by_name`` and appends every row to
        ``self.name_rows_by_name``.
        """
        self.names_data_by_name = defaultdict(list)
        names = self.names_data_col_names
        name_pos = names.index('name')
        rows = []
        # Explicit UTF-8 so non-ASCII names do not depend on the platform
        # default encoding; newline='' is what the csv module expects.
        with open(names_filename, "r", encoding="utf-8", newline="") as file:
            reader = csv.reader(file, delimiter='\t')
            next(reader, None)  # skip the header; tolerate a completely empty file
            for row in reader:
                name = row[name_pos].lower()
                row[name_pos] = name
                rows.append(row)
                self.name_rows_by_name[name].append(row)
        # Passing columns= keeps this working when there are no data rows.
        names_data = pd.DataFrame(rows, columns=names)
        return names_data.astype(self.names_data_dtype)

    def place_population(self, code):
        """Return the summed population of all places rows with ``code`` (cached)."""
        if code not in self.pop_by_place:
            self.pop_by_place[code] = self.places_data[self.places_data.code == code].population.sum()
        return self.pop_by_place[code]

    def min_frequency(self, code):
        """Return the smallest positive name frequency recorded for ``code`` (cached)."""
        if code not in self.min_freq_by_place:
            positive = self.names_data[(self.names_data['code'] == code) & (self.names_data['frequency'] > 0)]
            self.min_freq_by_place[code] = positive['frequency'].min()
        return self.min_freq_by_place[code]

    def name_data_for_name(self, name):
        """Return the names-table rows for ``name`` as a DataFrame.

        A small frame is built per name on first use and cached; filtering
        the full names frame on every call was found to be far too slow.
        Unknown names yield the shared empty frame.
        """
        if name in self.names_data_by_name:
            return self.names_data_by_name[name]
        if name in self.name_rows_by_name:
            frame = pd.DataFrame(self.name_rows_by_name[name], columns=self.names_data_col_names)
            self.names_data_by_name[name] = frame.astype(self.names_data_dtype)
            return self.names_data_by_name[name]
        return self.names_data_empty

    def get_all_timezones(self):
        """Return the distinct non-null timezones with positive population (cached)."""
        if self.all_timezones is None:
            df = self.places_data
            self.all_timezones = list(df[(df['timezone'].notnull()) & (df['population'] > 0)]['timezone'].unique())
        return self.all_timezones

    def country_pop_from_datetime(self, dt, offset):
        """Map place code to population for timezones matching ``offset`` at ``dt``.

        Args:
            dt: naive datetime interpreted as wall-clock time in each timezone.
            offset: UTC offset in minutes to match.

        Returns:
            dict ``{code: population}`` taken from the first places row of
            every matching timezone.
        """
        # NOTE(review): only the first row per timezone is used and a later
        # timezone with the same code overwrites the earlier one rather than
        # adding to it - confirm this is intended.
        df = self.places_data
        places_pop = {}
        for tz in self.get_all_timezones():
            timezone = pytz.timezone(tz)
            try:
                timezone_offset = timezone.utcoffset(dt).total_seconds() // 60
            except pytz.exceptions.AmbiguousTimeError:
                timezone_offset = timezone.utcoffset(dt, is_dst=True).total_seconds() // 60
            except pytz.exceptions.NonExistentTimeError:
                timezone_offset = None
            if timezone_offset == offset:
                df_tz_pop = df[df['timezone'] == tz].iloc[0]
                places_pop[df_tz_pop['code']] = df_tz_pop['population']
        return places_pop

    def score_a_name_part(self, name, countries=None, country_pop_map=None):
        """Score one name part per place code as ``population * frequency``.

        Args:
            name: a single, already lower-cased name part.
            countries: optional iterable of codes to restrict the result to.
            country_pop_map: optional ``{code: population}``; restricts the
                result to its keys and supplies the populations.

        Returns:
            ``(ranked, score_dict)`` where ``ranked`` is a list of
            ``(code, score)`` sorted by descending score.

        Raises:
            ValueError: if both ``countries`` and ``country_pop_map`` are given.
            LookupError: if a name row refers to a code missing from the
                places table.
        """
        if countries is not None and country_pop_map is not None:
            raise ValueError('At least one of countries and country_pop_map must be None')
        name_data = self.name_data_for_name(name)
        if country_pop_map is not None:
            countries = list(country_pop_map.keys())
        if countries is not None:
            name_data = name_data[name_data.code.isin(countries)]
        # Build the membership set once instead of scanning the column per row.
        known_codes = set(self.places_data["code"])
        score_dict = {}
        for code, frequency in zip(name_data.code, name_data.frequency):
            if code not in known_codes:
                raise LookupError(f'{code} not in places data frame')
            if country_pop_map is not None:
                population = country_pop_map[code]
            else:
                population = self.place_population(code)
            score = population * frequency
            score_dict[code] = score if code not in score_dict else score + score_dict[code]

        ranked = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
        return ranked, score_dict

    def guess_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
        """Score every candidate place for a full name.

        The name is lower-cased, split with ``advanced_splitter`` and each
        non-empty part is scored with ``score_a_name_part``. Per place, the
        part scores are combined according to ``self.algorithm``:

        * AVG: mean of the part scores, a missing part counting as 0;
        * PROD: product of the part frequencies times the population, a
          missing part counting as the place's minimum frequency.

        Returns:
            list of ``(code, score)`` sorted by descending score, plus the
            ``{code: score}`` dict when ``return_dict`` is true.

        Raises:
            ValueError: if both ``countries`` and ``country_pop_map`` are
                given, or if ``self.algorithm`` is unknown.
        """
        if countries is not None and country_pop_map is not None:
            raise ValueError('At least one of countries and country_pop_map must be None')
        # NOTE(review): lower-casing before splitting means the CamelCase
        # rules of advanced_splitter can never fire here - confirm intent.
        # Empty parts (e.g. from trailing punctuation) are dropped: they match
        # no name and would only dilute or distort the combined score.
        name_parts = [part for part in NameNationGuesser.advanced_splitter(name.lower()) if part]
        score_parts = []
        for name_part in name_parts:
            _, score_part_dict = self.score_a_name_part(name_part, countries=countries, country_pop_map=country_pop_map)
            score_parts.append(score_part_dict)

        all_places = set()
        for score_part in score_parts:
            all_places.update(score_part)

        scores_avg = {}
        for place in all_places:
            scores = []
            population = self.place_population(place)  # TODO: should we use the population of country_pop_map if available?
            for score_part in score_parts:
                if place in score_part:
                    scores.append(score_part[place])
                elif self.algorithm == Algorithms.AVG:
                    scores.append(0)
                elif self.algorithm == Algorithms.PROD:
                    scores.append(self.min_frequency(place) * population)
            if self.algorithm == Algorithms.AVG:
                score = sum(scores) / len(scores)
            elif self.algorithm == Algorithms.PROD:
                # Each part score is already multiplied by the population;
                # divide it out before taking the product.
                score = np.prod([part_score / population for part_score in scores]) * population
            else:
                raise ValueError(f'Unknown algorithm: {self.algorithm}')
            scores_avg[place] = score

        retval = sorted(scores_avg.items(), key=lambda item: item[1], reverse=True)
        if return_dict:
            return retval, scores_avg
        return retval

    def _pick_top(self, scores):
        """Return the best key of a ranked ``(key, score)`` list, or None.

        None is returned for an empty list, or when a threshold is set and
        the best score is less than ``guess_first_second_min_mag`` times the
        runner-up score.
        """
        if not scores:
            return None
        best, best_score = scores[0]
        if len(scores) == 1 or self.guess_first_second_min_mag is None:
            return best
        _, runner_up_score = scores[1]
        if best_score >= runner_up_score * self.guess_first_second_min_mag:
            return best
        return None

    def guess(self, name, countries=None, country_pop_map=None):
        """Return the most likely place code for ``name``, or None if undecided.

        Raises:
            ValueError: if both ``countries`` and ``country_pop_map`` are given.
        """
        if countries is not None and country_pop_map is not None:
            raise ValueError('At least one of countries and country_pop_map must be None')
        scores = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
        return self._pick_top(scores)

    def zone_scores_from_place_scores(self, score_list, return_dict=False):
        """Aggregate ``(code, score)`` pairs into summed scores per zone.

        Returns:
            list of ``(zone, score)`` sorted by descending score, plus the
            ``{zone: score}`` dict when ``return_dict`` is true.
        """
        score_dict = {}
        for code, score in score_list:
            zone = self.get_zone_by_place(code)
            score_dict[zone] = score if zone not in score_dict else score + score_dict[zone]
        retval = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)

        if return_dict:
            return retval, score_dict
        return retval

    def zone_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
        """Return the zone scores for ``name`` (see ``zone_scores_from_place_scores``).

        Raises:
            ValueError: if both ``countries`` and ``country_pop_map`` are given.
        """
        if countries is not None and country_pop_map is not None:
            raise ValueError('At least one of countries and country_pop_map must be None')
        score_list = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
        return self.zone_scores_from_place_scores(score_list, return_dict=return_dict)

    def guess_zone(self, name, countries=None, country_pop_map=None):
        """Return the most likely zone for ``name``, or None if undecided.

        Raises:
            ValueError: if both ``countries`` and ``country_pop_map`` are given.
        """
        scores = self.zone_scores(name, countries=countries, country_pop_map=country_pop_map)
        return self._pick_top(scores)

    def get_zone_by_place(self, code):
        """Return the ``place_column_name`` value of ``code``'s most populous row (cached)."""
        if code in self.zone_by_place:
            return self.zone_by_place[code]
        places_data_code = self.places_data[self.places_data.code == code]
        zone = places_data_code.loc[places_data_code['population'].idxmax()][self.place_column_name]
        self.zone_by_place[code] = zone

        return zone