Spaces:

AliMustapha
/

Geo-GenderStudy

Runtime error

App Files Files Community

AliMustapha committed on Sep 12, 2023

Commit

1ffb898

1 Parent(s): 46236ad

code cleaning

Browse files

Files changed (4) hide show

Dictionary_guesser/dsutil.py +0 -2
Dictionary_guesser/name_maker.py +0 -2
Dictionary_guesser/name_nation_guesser.py +1 -2
dt.py +386 -0

Dictionary_guesser/dsutil.py CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
-__copyright__ = "Copyright (C) 2022 Davide Rossi"
-__license__ = "GPL-3.0-or-later"
 import pandas as pd
 import numpy as np

 #!/usr/bin/env python3
 import pandas as pd
 import numpy as np

Dictionary_guesser/name_maker.py CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
-__copyright__ = "Copyright (C) 2022 Davide Rossi"
-__license__ = "GPL-3.0-or-later"
 import pandas as pd
 import numpy as np

 #!/usr/bin/env python3
 import pandas as pd
 import numpy as np

Dictionary_guesser/name_nation_guesser.py CHANGED Viewed

@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
-__copyright__ = "Copyright (C) 2022 Davide Rossi"
-__license__ = "GPL-3.0-or-later"
 import pandas as pd
 import numpy as np

 #!/usr/bin/env python3
 import pandas as pd
 import numpy as np

dt.py ADDED Viewed

	@@ -0,0 +1,386 @@

+# from Dictionary_guesser.name_nation_guesser import NameNationGuesser
+# import datetime
+# from GitScraping import CommitInfo
+# if __name__ == "__main__":
+#     guesser =NameNationGuesser(names_filename="Dictionary_guesser/names.csv",places_filename='Dictionary_guesser/places.tab', guess_first_second_min_mag=None,place_column_name="sub-region")
+#     commit_info=CommitInfo("https://github.com/AhmadM-DL/On-Learning-Implicit-Protected-Attributes")
+#     df,first_commit_dates = commit_info.get_first_commit_dates()
+#     def guess_zone(name, epoch, offset):
+#         dt = datetime.datetime.fromtimestamp(epoch)
+#         country_pop_map = guesser.country_pop_from_datetime(dt, offset)
+#         # print(country_pop_map)
+#         return guesser.guess_zone(name, country_pop_map=country_pop_map)
+#     first_commit_dates['Commit_Seconds'] = first_commit_dates['First_Commit_Date'].apply(lambda x: x.timestamp())
+#     first_commit_dates['Author_Timezone'] = first_commit_dates['Author_Timezone'] /60
+#     first_commit_dates['region_Dictionary'] = first_commit_dates.apply(lambda row: guess_zone(row['Author'],row['Commit_Seconds'], row['Author_Timezone']), axis=1)
+#     print(first_commit_dates)
+from google.cloud import storage
+import json
+import os
+import pandas as pd
+# # Initialize a client
+# jsonApi = os.getenv('apiKey')
+# print(jsonApi)
+# export jsonApi='{
+#     "type": "service_account",
+#     "project_id": "kinetic-guild-369323",
+#     "private_key_id": "b06a3ad76990da0e6970c072e95e7d26bb2e8c1d",
+#     "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDTahMlxsz8Zv1T\nY3+C4E5MS6SnP7ESeAKdhm6IHiELrvekmPsTkaZhf1UWpOuzj76iklWwocVpnDCw\nINn0BAg/ttBKA24PQzTinsw2C5gRtB4J36n5rLZ2wmLw8HkLXsm2z7w+4h0VHLCN\nIwp1+L/AS967nQ5qXzf+AR47RoKcMY7Ia2WzF8Cv+/rBsVbl2w+Mz7xTSnZl8neg\nIaL4QMguZWHR6W9Hc/lt1+ZSoDgZmqj+DjXd5NPwNfckPaX5nppz/kzmfuUkWI8U\njBv6KyqBKcmdr1aJMc8XH64y7BW6pmH7WEJAN/H/uiycj1b09Lr27BUwogbr/0Ae\n6exf0lnZAgMBAAECggEAL2foPj7LRUe0w0ea1paEiCAoHiauhoUplPgJffU/lLaZ\nqitxlWxCAjfCtS6q+ZsgdKTamR5VPX67/iqHpOtojBzqrMYDHmIEEFLqWK4V3dZl\nK/Ke0zESwyOIex15Dv8kvRzsya77NXo27pbuaBCssqpwmeI4UsriK89FX6ZKcEpV\n7xMJgOm9WA0OrPsO6GFVF5htvTh0QFuoq1kJDiQguOrez9qa+F52PXl4RArwCSeK\nbchQhHd6ASXCyRB+Bx39Vh62Xv6xJ6LiEsCC41gzH6jHAyFZZ/v2mFAlHnUg9qIN\nJKdQM5zXFA1dUB/l17k8BWu+Achanegd7gNxv1prrwKBgQD6AEgxYrVT5sChZjP2\nvzQIYNlx+rft8e/MsOjKzl+9ObS+1xuhTlekZnxhtRm8vAoD7XM4Rb4fDJWwR2vq\naBVvKt93Eg68gWsPOmqkdPkPLiW877VU4QLk07yEM6Nl/OqdlRF0EzKXHXmk4AVL\njSh0MLbc7McmoLVgre33m754IwKBgQDYfMKo4f8bev+91kXMTlTq/Kgpyyaqwcsq\nibyNYAtXdwknZW0iOhA9lQZADH9vG9QELspi0Zy2Uv1LLVk1cGJ6up8eVpWFzZ2S\nSgLZJ6WuqK6OcewqWFA/WQ3U94lKgdnWqT/rDSHgnw3kSmyEiQ+IL0zKa8IzuV0F\nRGqj4Ngn0wKBgQD1NMGabs6bdIELzUq6gd9vOE8O1HMDF4G0qvApuzF8T9VQOXwI\nQucDgOIOk6qiy2ynXYbdcsp/ecB4HhVi3KPpXYvBJhz+F5ICZbGjjHecxA6Pui2J\nCwnjlyoYIO3rYp5b4ZI033+HaImfhXqsF8/N5tn05uiOoqJEKVR2wHOZMQKBgBRy\nhDhLUDsaPPmDOYh4hZDEWGXKKFbMgxH7fHGl9qxGM/kinVI0RcBrSPHXvFmUOUxD\n1x3KSpD1+bKWD+z6NnL9GXZWGz1OFGnyz54PHpkGmaYeoH3HZZz2HlZVIwSEizy5\nM65RyTdcDoXXebRy9aKZRRmBYBBem6iZs7DS1de9AoGAFf/tR1HK4Cugh9vebzp0\nB5j7EJP1XESDKsGAOIFC7dereuDNHMDmRH72BMYSBvrfAY77mDzEpW1TGK9Qxch3\nvm1tKCdZTnYSMoq0nbc/QIFyn20StR6OD+0nS94NN8IpGM882D7fWITrhn4XrZe3\nrdE4C0JqAQ6BKL0ka4j93eQ=\n-----END PRIVATE KEY-----\n",
+#     "client_email": "[email protected]",
+#     "client_id": "102736498211031284416",
+#     "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+#     "token_uri": "https://oauth2.googleapis.com/token",
+#     "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+#     "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/geogenderali%40kinetic-guild-369323.iam.gserviceaccount.com",
+#     "universe_domain": "googleapis.com"
+#   }'
+#
+# os.environ['apiKey'] = json_string
+#
+jsonApi = os.getenv('jsonApi')
+bucket_name = os.getenv('bucket_name')
+file_name = os.getenv('file_name')
+print(file_name)
+print(bucket_name)
+print(jsonApi)
+service_account_info = json.loads(jsonApi)
+client = storage.Client.from_service_account_info(service_account_info)
+blob = client.get_bucket(bucket_name).blob(file_name)
+with blob.open("r") as file:
+    df = pd.read_csv(file,sep="\t")
+print(df.head())
+# Now df contains the data from the CSV file
+print(df.head())
+#!/usr/bin/env python3
+__copyright__ = "Copyright (C) 2022 Davide Rossi"
+__license__ = "GPL-3.0-or-later"
+import pandas as pd
+import numpy as np
+import re
+import random
+import pytz
+import datetime
+import time
+import regex
+import click
+import code
+import logging
+from enum import Enum
+import csv
+from unidecode import unidecode
+from collections import defaultdict
+class Algorithms(str,Enum):
+    """Ways in which guess_scores combines the per-name-part scores of a place."""
+    # AVG: arithmetic mean of the part scores; a name part with no data for the
+    # place contributes 0.
+    AVG = 'avg'
+    # PROD: product of the part scores divided by the place population, scaled
+    # back by that population; a name part with no data for the place uses the
+    # place's minimum positive frequency instead.
+    PROD = 'prod'
+class NameNationGuesser:
+    WORD_RE = re.compile(r'\W+')
+    CAMEL_RE = re.compile(r'([A-Z][a-z]+)')
+    UPPER_RE = re.compile(r'([A-Z]+)')
+    UNDER_RE = re.compile(r'(_)')
+    LEADING_BLANKS_RE = re.compile(r'^\s*')
+    DEFAULT_PLACES_FILENAME = 'places.tab'
+    DEFAULT_NAMES_FILENAME = 'names_codes.tab'
+    DEFAULT_GUESS_FIRST_SECOND_MIN_MAG = None
+    DEFAULT_ALGORITHM = Algorithms.AVG
+    DEFAULT_COLUMN_NAME="zone"
+    def __init__(self, places_filename=DEFAULT_PLACES_FILENAME, names_filename=DEFAULT_NAMES_FILENAME, algorithm=DEFAULT_ALGORITHM, guess_first_second_min_mag=DEFAULT_GUESS_FIRST_SECOND_MIN_MAG,place_column_name=DEFAULT_COLUMN_NAME):
+        self.zone_by_place = {}
+        self.pop_by_place = {}
+        self.min_freq_by_place = {}
+        self.cumsum_population_by_code = {}
+        self.name_rows_by_name = defaultdict(list)
+        self.all_timezones = None
+        self.places_data = None
+        self.names_data = None
+        self.names_data_empty = None
+        self.names_data_col_names = ['name', 'type', 'code', 'frequency', 'gender']
+        self.names_data_dtype = {'frequency': float}
+        self.names_data_by_name = None
+        self.guess_first_second_min_mag = guess_first_second_min_mag
+        self.algorithm = algorithm
+        self.place_column_name=place_column_name
+        self.places_data = pd.read_csv(places_filename, sep='\t', header=0, keep_default_na=False, na_values='',
+            names=['country', 'state_name', 'region', 'un_subregion', 'zone', 'timezone', 'population', 'sovereignty_numeric', 'sovereignty', 'code', 'code3', 'code_num', 'cctdl',"_",'sub-region'],
+            dtype={'population':int, 'sovereignty_numeric':int, 'code_num':int})
+        self.names_data = self.__read_names_data(names_filename)
+        self.names_data_empty = pd.DataFrame().reindex(columns=self.names_data.columns)
+    @classmethod
+    def advanced_splitter(cls, seq):
+        """Split words separated by spaces or using CamelNotation"""
+        return cls.WORD_RE.split(cls.LEADING_BLANKS_RE.sub(r'', cls.CAMEL_RE.sub(r' \1', cls.UPPER_RE.sub(r' \1', cls.UNDER_RE.sub(r' ', seq)))))
+    def __get_cumsum_population(self, code):
+        if code not in self.cumsum_population_by_code:
+            df = self.places_data
+            #create a data frame for a specific code with an ordered cumsum population
+            df_code = df[(df['code'] == code) & (df['population'] != 0)][['timezone', 'population']]
+            population = df_code['population'].sum()
+            df_code = df_code.sort_values('population')
+            df_code['population'] = df_code['population'].cumsum()
+            self.cumsum_population_by_code[code] = (population, df_code.copy().reset_index(drop=True))
+        return self.cumsum_population_by_code[code]
+    def compatible_datetime_offset(self, code):
+        #extract a random timezone, with a chance proportional to the population of the people in that timezone
+        population, df_code = self.__get_cumsum_population(code)
+        timezone_name = df_code[df_code['population'] >= random.randrange(population)].iloc[0]['timezone']
+        valid_time = False
+        while not valid_time:
+            valid_time = True
+            #create a random datetime from 1/1/1970 to now
+            current_epoch = time.time()
+            epoch = random.uniform(0, current_epoch)
+            dt = datetime.datetime.fromtimestamp(epoch)
+            #localize the datetime using the timezone and calculate its UTC offset
+            timezone = pytz.timezone(timezone_name)
+            try: #it may not work because of an ambiguous or unexistent time for that timezone in that date
+                offset = int(timezone.utcoffset(dt).total_seconds()/60)
+            except:
+                valid_time = False
+        return dt, offset
+    def is_roman_language(self,text):
+        roman_pattern = r'^\p{Latin}+$'
+        match = regex.match(roman_pattern, text, flags=regex.UNICODE)
+        return match is not None
+    def text_to_romanize(self,text):
+        text=str(text)
+        translator = str.maketrans(r"-._\/+", "      ")
+        text= text.translate(translator)
+        if not self.is_roman_language(text):
+            return  unidecode(text)
+        else :
+            return text
+    def __read_names_data(self, names_filename):
+        self.names_data_by_name = defaultdict(list)
+        names = self.names_data_col_names
+        name_pos = names.index('name')
+        dtype = self.names_data_dtype
+        rows = []
+        with open(names_filename, "r") as file:
+            reader = csv.reader(file, delimiter='\t')
+            next(reader)
+            for row in reader:
+                name = row[name_pos].lower()
+                row[name_pos] = name
+                rows.append(row)
+                self.name_rows_by_name[name].append(row)
+        names_data = pd.DataFrame(rows)
+        names_data.columns = names
+        names_data = names_data.astype(dtype)
+        return names_data
+    def place_population(self, code):
+        if code not in self.pop_by_place:
+            self.pop_by_place[code] = self.places_data[self.places_data.code == code].population.sum()
+        return self.pop_by_place[code]
+    def min_frequency(self, code):
+        if code not in self.min_freq_by_place:
+    #        min_freq_dict[code] = names_data[names_data.code == code]['frequency'].min()
+#            self.min_freq_by_place[code] = self.names_data['frequency'].min()
+            self.min_freq_by_place[code] = self.names_data[(self.names_data['code'] == code) & (self.names_data['frequency'] > 0)]['frequency'].min()
+        return self.min_freq_by_place[code]
+    def name_data_for_name(self, name):
+        if name in self.names_data_by_name:
+            return self.names_data_by_name[name]
+        elif name in self.name_rows_by_name:
+            self.names_data_by_name[name] = pd.DataFrame(self.name_rows_by_name[name])
+            self.names_data_by_name[name].columns = self.names_data_col_names
+            self.names_data_by_name[name] = self.names_data_by_name[name].astype(self.names_data_dtype)
+            return self.names_data_by_name[name]
+        else:
+            return self.names_data_empty
+    #    return names_data_by_name[name] if name in names_data_by_name else names_data_empty #that is deadly slow, it's better to create a new data frame for each name
+    def get_all_timezones(self):
+        df = self.places_data
+        if self.all_timezones is None:
+            self.all_timezones = list(df[(df['timezone'].notnull()) & (df['population'] > 0)]['timezone'].unique())
+        return self.all_timezones
+    def country_pop_from_datetime(self, dt, offset):
+        df = self.places_data
+        places_pop = {}
+        for tz in self.get_all_timezones():
+            timezone = pytz.timezone(tz)
+            try:
+                timezone_offset = timezone.utcoffset(dt).total_seconds() // 60
+            except pytz.exceptions.AmbiguousTimeError:
+                timezone_offset = timezone.utcoffset(dt, is_dst=True).total_seconds() // 60
+            except pytz.exceptions.NonExistentTimeError:
+                timezone_offset = None
+            if timezone_offset == offset:
+                df_tz_pop = df[df['timezone'] == tz].iloc[0]
+                population = df_tz_pop['population']
+                code = df_tz_pop['code']
+                places_pop[code] = population
+        return places_pop
+    def score_a_name_part(self, name, countries=None, country_pop_map=None):
+        if countries is not None and country_pop_map is not None:
+            raise ValueError(f'At least one of countries and country_pop_map must be None')
+        name_data = self.name_data_for_name(name)
+        if country_pop_map is not None:
+            countries = list(country_pop_map.keys())
+        if countries is not None:
+            name_data = name_data[name_data.code.isin(countries)]
+        score_dict = {}
+        for code, _, frequency in zip(name_data.code, name_data.type, name_data.frequency):
+    #        if not places_data[places_data.code == code].empty:
+            if code in self.places_data["code"].values:
+                if country_pop_map is not None:
+                    population = country_pop_map[code]
+                else:
+                    population = self.place_population(code)
+                score = population * frequency
+                score_dict[code] = score if not code in score_dict else score + score_dict[code]
+            else:
+                raise LookupError(f'{code} not in places data frame')
+        return [(code, score) for code, score in sorted(score_dict.items(), key=lambda item: item[1], reverse=True)], score_dict
+    def guess_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
+        if countries is not None and country_pop_map is not None:
+            raise ValueError(f'At least one of countries and country_pop_map must be None')
+        #collect scores dict for all name parts
+        score_parts = []
+        name = name.lower()
+        for name_part in NameNationGuesser.advanced_splitter(name):
+            _, score_part_dict = self.score_a_name_part(name_part, countries=countries, country_pop_map=country_pop_map)
+            score_parts.append(score_part_dict)
+        #identify all places in the scores
+        all_places = set()
+        for score_part in score_parts:
+            all_places = all_places.union(set(score_part.keys()))
+        parts = len(score_parts)
+        #construct a scores dict with the score for each place
+        scores_avg = {}
+        for place in all_places:
+            scores = []
+            population = self.place_population(place) #TODO: should we use the population of country_pop_map if available?
+            for score_part in score_parts:
+                if place in score_part:
+                    scores.append(score_part[place])
+                else:
+                    if self.algorithm == Algorithms.AVG:
+                        scores.append(0)
+                    elif self.algorithm == Algorithms.PROD:
+                        scores.append(self.min_frequency(place) * population)
+            if self.algorithm == Algorithms.AVG:
+                score = sum(scores) / len(scores)
+            elif self.algorithm == Algorithms.PROD:
+                score = np.prod([score/population for score in scores]) * population #each score part is already multiplied by population, this fixes that
+            else:
+                raise ValueError(f'Unknown algorithm: {self.algorithm}')
+            scores_avg[place] = score
+        retval = [(code, score) for code, score in sorted(scores_avg.items(), key=lambda item: item[1], reverse=True)]
+        if return_dict:
+            return retval, scores_avg
+        else:
+            return retval
+    def guess(self, name, countries=None, country_pop_map=None):
+        if countries is not None and country_pop_map is not None:
+            raise ValueError(f'At least one of countries and country_pop_map must be None')
+        scores = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
+        if len(scores) == 0:
+            return None
+        if len(scores) == 1 or self.guess_first_second_min_mag is None:
+            place, _ = scores[0]
+            return place
+        else:
+            place, score0 = scores[0]
+            _, score1 = scores[1]
+            if score0 >= score1 * self.guess_first_second_min_mag:
+                return place
+            else:
+                return None
+    def zone_scores_from_place_scores(self, score_list, return_dict=False):
+        score_dict = {}
+        for code, score in score_list:
+            zone = self.get_zone_by_place(code)
+            score_dict[zone] = score if zone not in score_dict else score + score_dict[zone]
+        retval = [(zone, score) for zone, score in sorted(score_dict.items(), key=lambda item: item[1], reverse=True)]
+        if return_dict:
+            return retval, score_dict
+        else:
+            return retval
+    def zone_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
+        if countries is not None and country_pop_map is not None:
+            raise ValueError(f'At least one of countries and country_pop_map must be None')
+        score_list = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
+        return self.zone_scores_from_place_scores(score_list, return_dict=return_dict)
+    def guess_zone(self, name, countries=None, country_pop_map=None):
+        scores = self.zone_scores(name, countries=countries, country_pop_map=country_pop_map)
+        if len(scores) == 0:
+            return None
+        if len(scores) == 1 or self.guess_first_second_min_mag is None:
+            place, _ = scores[0]
+            return place
+        else:
+            place, score0 = scores[0]
+            _, score1 = scores[1]
+            if score0 >= score1 * self.guess_first_second_min_mag:
+                return place
+            else:
+                return None
+    def get_zone_by_place(self, code):
+        if code in self.zone_by_place:
+            return self.zone_by_place[code]
+        places_data_code = self.places_data[self.places_data.code == code]
+        zone = places_data_code.loc[places_data_code['population'].idxmax()][self.place_column_name]
+        self.zone_by_place[code] = zone
+        return zone