osci/transformers/rankers/commits_ranking.py (26 lines of code) (raw):
"""Copyright since 2020, EPAM Systems
This file is part of OSCI.
OSCI is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OSCI is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OSCI. If not, see <http://www.gnu.org/licenses/>."""
from pyspark.sql import DataFrame
from pyspark.sql import functions as f
def get_commits_ranking(df: DataFrame, commits_id_field: str, company_field: str,
                        result_field: str = 'Commits') -> DataFrame:
    """Rank companies by their amount of commits

    :param df: PushEventsCommits
    :param commits_id_field: Name of the column with the commit identifier (such as SHA)
    :param company_field: Name of the column with the company name
    :param result_field: Name of the output column which will contain the amount of commits
    :return: DataFrame with one row per company and its amount of commits,
        sorted by that amount in descending order
    """
    # Only the commit identifier and the company take part in the ranking.
    commits_by_company = df.select(commits_id_field, company_field).groupby(company_field)
    commits_amount = f.count(f.col(commits_id_field)).alias(result_field)
    return commits_by_company.agg(commits_amount).orderBy(result_field, ascending=False)
def get_month_by_month_commits_amounts(df: DataFrame, commits_id_field: str, datetime_field: str,
                                       result_month_field: str = 'Month',
                                       result_field: str = 'Commits') -> DataFrame:
    """Get month-by-month amount of commits

    :param df: PushEventsCommits
    :param commits_id_field: Commit identifier (such as SHA)
    :param datetime_field: Event created at datetime field
    :param result_month_field: Field in output df which will contain the month,
        formatted as ``yyyy-MM``
    :param result_field: Field in output df which will contain the amount of commits
    :return: DataFrame with one row per month and the amount of commits in that month,
        sorted by the month field
    """
    # Grouping key: the event datetime truncated to its year and month.
    month = f.date_format(f.col(datetime_field), "yyyy-MM").alias(result_month_field)
    # NOTE(review): the amount is count('*') over the rows of each month; the commit
    # identifier column is selected but is not what gets counted - confirm this is intended.
    return df \
        .select(f.col(commits_id_field), f.col(datetime_field)) \
        .groupby(month) \
        .agg(f.count('*').alias(result_field)) \
        .sort(result_month_field)
def get_employees_commits_amount(df: DataFrame,
                                 author_name_field: str,
                                 author_email_field: str,
                                 commits_id_field: str,
                                 result_field: str = 'Commits') -> DataFrame:
    """Count commits per commit author

    :param df: PushEventsCommits
    :param author_name_field: Name of the column with the commit author name
    :param author_email_field: Name of the column with the commit author email
    :param commits_id_field: Name of the column with the commit identifier (such as SHA)
    :param result_field: Name of the output column which will contain the amount of commits
    :return: DataFrame with one row per (author name, author email) pair and its
        amount of commits, sorted by that amount in descending order
    """
    # An author is identified by the combination of name and email.
    author_fields = (author_name_field, author_email_field)
    commits_amount = f.count(commits_id_field).alias(result_field)
    return (
        df.select(*author_fields, commits_id_field)
        .groupby(*author_fields)
        .agg(commits_amount)
        .orderBy(result_field, ascending=False)
    )