# Geo-GenderStudy / GitScraping.py
# (file-viewer residue kept for reference: "AliMustapha's picture",
#  commit message "some changing", revision 5603359, raw / history / blame,
#  size 1.64 kB)
__copyright__ = "Copyright (C) 2023 Ali Mustapha"
__license__ = "GPL-3.0-or-later"
import pandas as pd
from pydriller import Repository
import plotly.graph_objects as go
from get_gender import GenderPredictor
class CommitInfo:
    """Collect per-commit author metadata for a single Git repository.

    On construction the repository at ``repo_url`` is traversed with
    PyDriller and the result is stored in ``self.df``, a DataFrame with the
    columns ``Author``, ``Committer_Date`` (normalized to UTC),
    ``Author_Timezone`` and ``Year``.
    """

    # Fields read from every commit, in DataFrame column order. Passing them
    # explicitly keeps the frame's shape stable even when there are no rows.
    _COMMIT_COLUMNS = ['Author', 'Committer_Date', 'Author_Timezone']

    def __init__(self, repo_url):
        """Store ``repo_url`` and eagerly load its commit table.

        Args:
            repo_url: Repository location handed unchanged to PyDriller's
                ``Repository``.
        """
        self.repo_url = repo_url
        self.df = self.get_commit_info()

    def get_commit_info(self):
        """Traverse the repository and return one row per commit.

        Returns:
            A DataFrame with columns ``Author``, ``Committer_Date``,
            ``Author_Timezone`` and ``Year``. A repository without commits
            yields an empty frame with those same columns instead of
            raising ``KeyError``.
        """
        commit_data = [
            {
                'Author': commit.author.name,
                'Committer_Date': commit.committer_date,
                'Author_Timezone': commit.author_timezone,
            }
            for commit in Repository(self.repo_url).traverse_commits()
        ]
        return self._build_frame(commit_data)

    @staticmethod
    def _build_frame(commit_data):
        """Turn a list of per-commit dicts into the normalized commit table."""
        # Explicit columns: without them an empty list produces a frame with
        # no columns at all and the lookups below fail with KeyError.
        df = pd.DataFrame(commit_data, columns=CommitInfo._COMMIT_COLUMNS)
        # utc=True converts every timestamp to UTC, so commits recorded with
        # different offsets end up in one comparable datetime column.
        df["Committer_Date"] = pd.to_datetime(df["Committer_Date"], utc=True)
        # Calendar year of the UTC-normalized committer date.
        df["Year"] = df["Committer_Date"].dt.year
        return df

    def get_first_commit_dates(self):
        """Summarize each author's earliest commit.

        Returns:
            A tuple ``(self.df, first_commit_dates)``. ``first_commit_dates``
            has one row per author with the columns ``Author``,
            ``First_Commit_Date`` (minimum ``Committer_Date``) and
            ``Author_Timezone``.

        Note:
            ``Author_Timezone`` is the first non-null value for the author in
            row order of ``self.df``; it is not guaranteed to come from the
            same commit as ``First_Commit_Date``.
        """
        first_commit_dates = (
            self.df.groupby('Author')
            .agg({'Committer_Date': 'min', 'Author_Timezone': 'first'})
            .reset_index()
        )
        # Rename positionally: grouping key, aggregated date, aggregated tz.
        first_commit_dates.columns = ['Author', 'First_Commit_Date', 'Author_Timezone']
        return self.df, first_commit_dates