osci/preprocess/match_company/push_commits.py (29 lines of code) (raw):

"""Copyright since 2019, EPAM Systems This file is part of OSCI. OSCI is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OSCI is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OSCI. If not, see <http://www.gnu.org/licenses/>.""" from .match import match_company_by_email from typing import Iterable, Tuple import pandas as pd import logging log = logging.getLogger(__name__) def add_company_column_by_email(df: pd.DataFrame, email_field: str, company_field: str) -> pd.DataFrame: log.debug(f'Add company column {company_field} by field {email_field}') df[company_field] = df[email_field].apply(func=match_company_by_email) return df def filter_out_non_company_commits(df: pd.DataFrame, company_field: str) -> pd.DataFrame: log.debug('Remove `null` companies') return df[df[company_field].notnull()] def parse_dates(df: pd.DataFrame, datetime_field: str) -> pd.DataFrame: log.debug('Parse string column to datetime') if pd.api.types.is_datetime64_any_dtype(df[datetime_field]): log.warning(f'Column {datetime_field} is already type {df[datetime_field].dtype}. Nothing to do') return df if pd.api.types.is_string_dtype(df[datetime_field]): df[datetime_field] = pd.to_datetime(df[datetime_field]) else: log.warning(f'Cannot parse to datetime column {datetime_field} with type {df[datetime_field].dtype}') return df def process_push_commits(df: pd.DataFrame, email_field: str, company_field: str, datetime_field: str) -> Iterable[Tuple[str, pd.DataFrame]]: df = add_company_column_by_email(df, email_field=email_field, company_field=company_field) df = filter_out_non_company_commits(df, company_field=company_field) df = parse_dates(df, datetime_field=datetime_field) log.debug(f'Group df by company field: {company_field}') return df.groupby(company_field)