osci/preprocess/load_repositories.py (33 lines of code) (raw):
"""Copyright since 2021, EPAM Systems
This file is part of OSCI.
OSCI is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OSCI is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OSCI. If not, see <http://www.gnu.org/licenses/>."""
from osci.datalake.repositories import Repositories
from osci.datalake import DataLake
from osci.crawlers.github.rest import GithubRest
from osci.crawlers.github.repository.info import parse_get_repository_response
from osci.config import Config
from datetime import datetime
from typing import Iterable
import pandas as pd
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def _load_repositories(repos_names: Iterable[str]) -> pd.DataFrame:
    """Fetch information for the given repositories and return it as a DataFrame.

    Each name is requested through ``GithubRest.get_repository``; the response is
    parsed with ``parse_get_repository_response`` and reduced to the fields
    listed in ``Repositories.schema.required``.

    A repository is skipped when:
      * the REST call returns ``None``;
      * its ``Repositories.schema.license`` attribute is falsy;
      * fetching or parsing it raises (the error is logged and the loop
        continues with the next name).

    :param repos_names: iterable of repository names to request
    :return: DataFrame with columns ``Repositories.schema.required`` and one
        row per successfully loaded repository
    """

    def _repositories(names: Iterable[str]) -> Iterable[dict]:
        """Lazily yield one ``{field: value}`` row per loadable repository."""
        # A single REST session is shared by all requests and closed on exit.
        with GithubRest(token=Config().github_token) as rest:
            for name in names:
                try:
                    repo_resp = rest.get_repository(repo_name=name)
                    if repo_resp is None:
                        continue
                    repository = parse_get_repository_response(
                        repo_resp,
                        # Naive local date at the moment the response is parsed.
                        downloaded_at=datetime.now().date(),
                    )
                    # Repositories without a license value are left out.
                    if not getattr(repository, Repositories.schema.license):
                        continue
                    row = {field: getattr(repository, field)
                           for field in Repositories.schema.required}
                except Exception as ex:
                    # Best effort: one failing repository must not abort the
                    # whole batch. `log.exception` keeps the traceback.
                    log.exception(f'Failed loading repository {name}. ex: {ex}')
                    continue
                # Yield outside the `try` so the handler above only covers
                # fetching/parsing, not whatever the consumer does.
                yield row

    return pd.DataFrame(_repositories(names=repos_names), columns=Repositories.schema.required)
def load_repositories(date: datetime) -> pd.DataFrame:
    """Load repositories information for ``date``, save it and return it.

    The repository names are read from ``DataLake().landing.get_repositories``.
    When that frame is not empty, the names column is passed to
    ``_load_repositories``; otherwise an empty frame with the
    ``Repositories.schema.required`` columns is used. In both cases the
    resulting frame is handed to ``Repositories(date=date).save``.

    :param date: date the repositories information is loaded for
    :return: the DataFrame that was saved
    """
    log.debug(f'Load repositories information for {date:%Y-%m-%d}')
    target = Repositories(date=date)

    # Default result: no rows, but the expected set of columns.
    result = pd.DataFrame(data=[], columns=Repositories.schema.required)

    landing_names = DataLake().landing.get_repositories(date=date)
    has_names = not landing_names.empty
    if has_names:
        name_column = DataLake().landing.schemas.repositories_names.name
        result = _load_repositories(repos_names=landing_names[name_column])

    target.save(result)
    return result