Spaces:

AliMustapha
/

Geo-GenderStudy

Runtime error

App Files Files Community

AliMustapha committed on Sep 12, 2023

Commit

43d233e

1 Parent(s): 1ffb898

rmv

Browse files

Files changed (1) hide show

dt.py +0 -386

dt.py DELETED Viewed

@@ -1,386 +0,0 @@
-# from Dictionary_guesser.name_nation_guesser import NameNationGuesser
-# import datetime
-# from GitScraping import CommitInfo
-# if __name__ == "__main__":
-#     guesser =NameNationGuesser(names_filename="Dictionary_guesser/names.csv",places_filename='Dictionary_guesser/places.tab', guess_first_second_min_mag=None,place_column_name="sub-region")
-#     commit_info=CommitInfo("https://github.com/AhmadM-DL/On-Learning-Implicit-Protected-Attributes")
-#     df,first_commit_dates = commit_info.get_first_commit_dates()
-#     def guess_zone(name, epoch, offset):
-#         dt = datetime.datetime.fromtimestamp(epoch)
-#         country_pop_map = guesser.country_pop_from_datetime(dt, offset)
-#         # print(country_pop_map)
-#         return guesser.guess_zone(name, country_pop_map=country_pop_map)
-#     first_commit_dates['Commit_Seconds'] = first_commit_dates['First_Commit_Date'].apply(lambda x: x.timestamp())
-#     first_commit_dates['Author_Timezone'] = first_commit_dates['Author_Timezone'] /60
-#     first_commit_dates['region_Dictionary'] = first_commit_dates.apply(lambda row: guess_zone(row['Author'],row['Commit_Seconds'], row['Author_Timezone']), axis=1)
-#     print(first_commit_dates)
-from google.cloud import storage
-import json
-import os
-import pandas as pd
-# # Initialize a client
-# jsonApi = os.getenv('apiKey')
-# print(jsonApi)
-# export jsonApi='{
-#     "type": "service_account",
-#     "project_id": "kinetic-guild-369323",
-#     "private_key_id": "<REDACTED>",
-#     "private_key": "<REDACTED: service-account private key removed; the exposed key must be revoked and rotated>",
-#     "client_email": "[email protected]",
-#     "client_id": "102736498211031284416",
-#     "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-#     "token_uri": "https://oauth2.googleapis.com/token",
-#     "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-#     "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/geogenderali%40kinetic-guild-369323.iam.gserviceaccount.com",
-#     "universe_domain": "googleapis.com"
-#   }'
-#
-# os.environ['apiKey'] = json_string
-#
-jsonApi = os.getenv('jsonApi')
-bucket_name = os.getenv('bucket_name')
-file_name = os.getenv('file_name')
-print(file_name)
-print(bucket_name)
-print(jsonApi)
-service_account_info = json.loads(jsonApi)
-client = storage.Client.from_service_account_info(service_account_info)
-blob = client.get_bucket(bucket_name).blob(file_name)
-with blob.open("r") as file:
-    df = pd.read_csv(file,sep="\t")
-print(df.head())
-# Now df contains the data from the CSV file
-print(df.head())
-#!/usr/bin/env python3
-__copyright__ = "Copyright (C) 2022 Davide Rossi"
-__license__ = "GPL-3.0-or-later"
-import pandas as pd
-import numpy as np
-import re
-import random
-import pytz
-import datetime
-import time
-import regex
-import click
-import code
-import logging
-from enum import Enum
-import csv
-from unidecode import unidecode
-from collections import defaultdict
-class Algorithms(str,Enum):
-    """Names of the strategies for combining the per-name-part scores of a place."""
-    # AVG: NameNationGuesser.guess_scores takes the arithmetic mean of the part scores,
-    # counting a part that does not know the place as 0.
-    AVG = 'avg'
-    # PROD: guess_scores multiplies the part scores (each divided by the place
-    # population) and rescales the product by the population.
-    PROD = 'prod'
-class NameNationGuesser:
-    """Score places (by code) and zones for a personal name.
-
-    Scores combine a name-frequency table with the populations found in a
-    places table; both tables are loaded from tab-separated files in __init__.
-    """
-    # Regexes used by advanced_splitter to break a string into words.
-    WORD_RE = re.compile(r'\W+')                # runs of non-word characters (split points)
-    CAMEL_RE = re.compile(r'([A-Z][a-z]+)')     # an upper-case letter followed by lower-case letters
-    UPPER_RE = re.compile(r'([A-Z]+)')          # a run of upper-case letters
-    UNDER_RE = re.compile(r'(_)')               # an underscore
-    LEADING_BLANKS_RE = re.compile(r'^\s*')     # whitespace at the start of the string
-    # Defaults for the __init__ parameters.
-    DEFAULT_PLACES_FILENAME = 'places.tab'
-    DEFAULT_NAMES_FILENAME = 'names_codes.tab'
-    # None makes guess() / guess_zone() return the top result without comparing it to the runner-up.
-    DEFAULT_GUESS_FIRST_SECOND_MIN_MAG = None
-    DEFAULT_ALGORITHM = Algorithms.AVG
-    # Column of the places table that get_zone_by_place reads the zone from.
-    DEFAULT_COLUMN_NAME="zone"
-    def __init__(self, places_filename=DEFAULT_PLACES_FILENAME, names_filename=DEFAULT_NAMES_FILENAME, algorithm=DEFAULT_ALGORITHM, guess_first_second_min_mag=DEFAULT_GUESS_FIRST_SECOND_MIN_MAG,place_column_name=DEFAULT_COLUMN_NAME):
-        self.zone_by_place = {}
-        self.pop_by_place = {}
-        self.min_freq_by_place = {}
-        self.cumsum_population_by_code = {}
-        self.name_rows_by_name = defaultdict(list)
-        self.all_timezones = None
-        self.places_data = None
-        self.names_data = None
-        self.names_data_empty = None
-        self.names_data_col_names = ['name', 'type', 'code', 'frequency', 'gender']
-        self.names_data_dtype = {'frequency': float}
-        self.names_data_by_name = None
-        self.guess_first_second_min_mag = guess_first_second_min_mag
-        self.algorithm = algorithm
-        self.place_column_name=place_column_name
-        self.places_data = pd.read_csv(places_filename, sep='\t', header=0, keep_default_na=False, na_values='',
-            names=['country', 'state_name', 'region', 'un_subregion', 'zone', 'timezone', 'population', 'sovereignty_numeric', 'sovereignty', 'code', 'code3', 'code_num', 'cctdl',"_",'sub-region'],
-            dtype={'population':int, 'sovereignty_numeric':int, 'code_num':int})
-        self.names_data = self.__read_names_data(names_filename)
-        self.names_data_empty = pd.DataFrame().reindex(columns=self.names_data.columns)
-    @classmethod
-    def advanced_splitter(cls, seq):
-        """Split words separated by spaces or using CamelNotation"""
-        return cls.WORD_RE.split(cls.LEADING_BLANKS_RE.sub(r'', cls.CAMEL_RE.sub(r' \1', cls.UPPER_RE.sub(r' \1', cls.UNDER_RE.sub(r' ', seq)))))
-    def __get_cumsum_population(self, code):
-        if code not in self.cumsum_population_by_code:
-            df = self.places_data
-            #create a data frame for a specific code with an ordered cumsum population
-            df_code = df[(df['code'] == code) & (df['population'] != 0)][['timezone', 'population']]
-            population = df_code['population'].sum()
-            df_code = df_code.sort_values('population')
-            df_code['population'] = df_code['population'].cumsum()
-            self.cumsum_population_by_code[code] = (population, df_code.copy().reset_index(drop=True))
-        return self.cumsum_population_by_code[code]
-    def compatible_datetime_offset(self, code):
-        #extract a random timezone, with a chance proportional to the population of the people in that timezone
-        population, df_code = self.__get_cumsum_population(code)
-        timezone_name = df_code[df_code['population'] >= random.randrange(population)].iloc[0]['timezone']
-        valid_time = False
-        while not valid_time:
-            valid_time = True
-            #create a random datetime from 1/1/1970 to now
-            current_epoch = time.time()
-            epoch = random.uniform(0, current_epoch)
-            dt = datetime.datetime.fromtimestamp(epoch)
-            #localize the datetime using the timezone and calculate its UTC offset
-            timezone = pytz.timezone(timezone_name)
-            try: #it may not work because of an ambiguous or unexistent time for that timezone in that date
-                offset = int(timezone.utcoffset(dt).total_seconds()/60)
-            except:
-                valid_time = False
-        return dt, offset
-    def is_roman_language(self,text):
-        roman_pattern = r'^\p{Latin}+$'
-        match = regex.match(roman_pattern, text, flags=regex.UNICODE)
-        return match is not None
-    def text_to_romanize(self,text):
-        text=str(text)
-        translator = str.maketrans(r"-._\/+", "      ")
-        text= text.translate(translator)
-        if not self.is_roman_language(text):
-            return  unidecode(text)
-        else :
-            return text
-    def __read_names_data(self, names_filename):
-        self.names_data_by_name = defaultdict(list)
-        names = self.names_data_col_names
-        name_pos = names.index('name')
-        dtype = self.names_data_dtype
-        rows = []
-        with open(names_filename, "r") as file:
-            reader = csv.reader(file, delimiter='\t')
-            next(reader)
-            for row in reader:
-                name = row[name_pos].lower()
-                row[name_pos] = name
-                rows.append(row)
-                self.name_rows_by_name[name].append(row)
-        names_data = pd.DataFrame(rows)
-        names_data.columns = names
-        names_data = names_data.astype(dtype)
-        return names_data
-    def place_population(self, code):
-        if code not in self.pop_by_place:
-            self.pop_by_place[code] = self.places_data[self.places_data.code == code].population.sum()
-        return self.pop_by_place[code]
-    def min_frequency(self, code):
-        if code not in self.min_freq_by_place:
-    #        min_freq_dict[code] = names_data[names_data.code == code]['frequency'].min()
-#            self.min_freq_by_place[code] = self.names_data['frequency'].min()
-            self.min_freq_by_place[code] = self.names_data[(self.names_data['code'] == code) & (self.names_data['frequency'] > 0)]['frequency'].min()
-        return self.min_freq_by_place[code]
-    def name_data_for_name(self, name):
-        if name in self.names_data_by_name:
-            return self.names_data_by_name[name]
-        elif name in self.name_rows_by_name:
-            self.names_data_by_name[name] = pd.DataFrame(self.name_rows_by_name[name])
-            self.names_data_by_name[name].columns = self.names_data_col_names
-            self.names_data_by_name[name] = self.names_data_by_name[name].astype(self.names_data_dtype)
-            return self.names_data_by_name[name]
-        else:
-            return self.names_data_empty
-    #    return names_data_by_name[name] if name in names_data_by_name else names_data_empty #that is deadly slow, it's better to create a new data frame for each name
-    def get_all_timezones(self):
-        df = self.places_data
-        if self.all_timezones is None:
-            self.all_timezones = list(df[(df['timezone'].notnull()) & (df['population'] > 0)]['timezone'].unique())
-        return self.all_timezones
-    def country_pop_from_datetime(self, dt, offset):
-        df = self.places_data
-        places_pop = {}
-        for tz in self.get_all_timezones():
-            timezone = pytz.timezone(tz)
-            try:
-                timezone_offset = timezone.utcoffset(dt).total_seconds() // 60
-            except pytz.exceptions.AmbiguousTimeError:
-                timezone_offset = timezone.utcoffset(dt, is_dst=True).total_seconds() // 60
-            except pytz.exceptions.NonExistentTimeError:
-                timezone_offset = None
-            if timezone_offset == offset:
-                df_tz_pop = df[df['timezone'] == tz].iloc[0]
-                population = df_tz_pop['population']
-                code = df_tz_pop['code']
-                places_pop[code] = population
-        return places_pop
-    def score_a_name_part(self, name, countries=None, country_pop_map=None):
-        if countries is not None and country_pop_map is not None:
-            raise ValueError(f'At least one of countries and country_pop_map must be None')
-        name_data = self.name_data_for_name(name)
-        if country_pop_map is not None:
-            countries = list(country_pop_map.keys())
-        if countries is not None:
-            name_data = name_data[name_data.code.isin(countries)]
-        score_dict = {}
-        for code, _, frequency in zip(name_data.code, name_data.type, name_data.frequency):
-    #        if not places_data[places_data.code == code].empty:
-            if code in self.places_data["code"].values:
-                if country_pop_map is not None:
-                    population = country_pop_map[code]
-                else:
-                    population = self.place_population(code)
-                score = population * frequency
-                score_dict[code] = score if not code in score_dict else score + score_dict[code]
-            else:
-                raise LookupError(f'{code} not in places data frame')
-        return [(code, score) for code, score in sorted(score_dict.items(), key=lambda item: item[1], reverse=True)], score_dict
-    def guess_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
-        """Score every candidate place for a full name.
-
-        The name is lower-cased, split with advanced_splitter, and each part is
-        scored with score_a_name_part; the per-part scores of a place are then
-        combined according to ``self.algorithm``.
-
-        countries / country_pop_map: optional, mutually exclusive filters that are
-            forwarded to score_a_name_part.
-        return_dict: when true, also return the ``{place: score}`` dict.
-
-        Returns the ``(place, score)`` pairs sorted by descending score, or that
-        list together with the dict when *return_dict* is true.
-        Raises ValueError if both filters are given, or if ``self.algorithm`` is
-        neither AVG nor PROD while at least one place has been scored.
-        """
-        if countries is not None and country_pop_map is not None:
-            raise ValueError(f'At least one of countries and country_pop_map must be None')
-        #collect scores dict for all name parts
-        score_parts = []
-        # NOTE(review): lower-casing before the split means the upper-case based
-        # regexes of advanced_splitter cannot match here; confirm this is intended.
-        name = name.lower()
-        for name_part in NameNationGuesser.advanced_splitter(name):
-            _, score_part_dict = self.score_a_name_part(name_part, countries=countries, country_pop_map=country_pop_map)
-            score_parts.append(score_part_dict)
-        #identify all places in the scores
-        all_places = set()
-        for score_part in score_parts:
-            all_places = all_places.union(set(score_part.keys()))
-        # number of name parts (not used below)
-        parts = len(score_parts)
-        #construct a scores dict with the score for each place
-        scores_avg = {}
-        for place in all_places:
-            scores = []
-            population = self.place_population(place) #TODO: should we use the population of country_pop_map if available?
-            for score_part in score_parts:
-                if place in score_part:
-                    scores.append(score_part[place])
-                else:
-                    # this name part has no entry for the place: substitute a fallback score
-                    if self.algorithm == Algorithms.AVG:
-                        scores.append(0)
-                    elif self.algorithm == Algorithms.PROD:
-                        # smallest positive frequency of the place, scaled like a regular part score
-                        scores.append(self.min_frequency(place) * population)
-            if self.algorithm == Algorithms.AVG:
-                score = sum(scores) / len(scores)
-            elif self.algorithm == Algorithms.PROD:
-                score = np.prod([score/population for score in scores]) * population #each score part is already multiplied by population, this fixes that
-            else:
-                raise ValueError(f'Unknown algorithm: {self.algorithm}')
-            scores_avg[place] = score
-        # highest score first
-        retval = [(code, score) for code, score in sorted(scores_avg.items(), key=lambda item: item[1], reverse=True)]
-        if return_dict:
-            return retval, scores_avg
-        else:
-            return retval
-    def guess(self, name, countries=None, country_pop_map=None):
-        if countries is not None and country_pop_map is not None:
-            raise ValueError(f'At least one of countries and country_pop_map must be None')
-        scores = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
-        if len(scores) == 0:
-            return None
-        if len(scores) == 1 or self.guess_first_second_min_mag is None:
-            place, _ = scores[0]
-            return place
-        else:
-            place, score0 = scores[0]
-            _, score1 = scores[1]
-            if score0 >= score1 * self.guess_first_second_min_mag:
-                return place
-            else:
-                return None
-    def zone_scores_from_place_scores(self, score_list, return_dict=False):
-        score_dict = {}
-        for code, score in score_list:
-            zone = self.get_zone_by_place(code)
-            score_dict[zone] = score if zone not in score_dict else score + score_dict[zone]
-        retval = [(zone, score) for zone, score in sorted(score_dict.items(), key=lambda item: item[1], reverse=True)]
-        if return_dict:
-            return retval, score_dict
-        else:
-            return retval
-    def zone_scores(self, name, countries=None, country_pop_map=None, return_dict=False):
-        if countries is not None and country_pop_map is not None:
-            raise ValueError(f'At least one of countries and country_pop_map must be None')
-        score_list = self.guess_scores(name, countries=countries, country_pop_map=country_pop_map)
-        return self.zone_scores_from_place_scores(score_list, return_dict=return_dict)
-    def guess_zone(self, name, countries=None, country_pop_map=None):
-        scores = self.zone_scores(name, countries=countries, country_pop_map=country_pop_map)
-        if len(scores) == 0:
-            return None
-        if len(scores) == 1 or self.guess_first_second_min_mag is None:
-            place, _ = scores[0]
-            return place
-        else:
-            place, score0 = scores[0]
-            _, score1 = scores[1]
-            if score0 >= score1 * self.guess_first_second_min_mag:
-                return place
-            else:
-                return None
-    def get_zone_by_place(self, code):
-        if code in self.zone_by_place:
-            return self.zone_by_place[code]
-        places_data_code = self.places_data[self.places_data.code == code]
-        zone = places_data_code.loc[places_data_code['population'].idxmax()][self.place_column_name]
-        self.zone_by_place[code] = zone
-        return zone