osci/preprocess/match_company/company_domain_matcher.py (37 lines of code) (raw):
"""Copyright since 2019, EPAM Systems
This file is part of OSCI.
OSCI is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OSCI is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OSCI. If not, see <http://www.gnu.org/licenses/>."""
from pathlib import Path
from typing import Optional
import yaml
import re
# Default match list: the YAML file located in the same directory as this module.
DEFAULT_MATCH_LIST_PATH = Path(__file__).parent.resolve() / 'company_domain_match_list.yaml'
class MetaSingleton(type):
    """Metaclass that hands out one shared instance per class.

    The first call to a class using this metaclass constructs the instance
    and records it in ``_instances``; every later call returns the recorded
    instance and does not use the arguments it was given.
    """

    # Registry of already-created instances, keyed by the class they belong to.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the recorded instance of ``cls``, creating it on first use."""
        if cls in cls._instances:
            return cls._instances[cls]
        instance = super().__call__(*args, **kwargs)
        cls._instances[cls] = instance
        return instance
class CompanyDomainMatcher(metaclass=MetaSingleton):
    """Resolve a domain to a company name using a YAML match list.

    The match list is a sequence of mappings, each read through the keys
    ``company`` (the name returned on a match), ``domains`` (exact domain
    strings) and ``regex`` (patterns applied with ``re.match``).

    The class is created through ``MetaSingleton``, so repeated construction
    returns the first instance; a ``path`` passed to a later call is not used
    unless ``tear_down`` was called in between.
    """

    def __init__(self, path=DEFAULT_MATCH_LIST_PATH):
        """Load the match list found at ``path``.

        :param path: location of the YAML match list file.
        """
        self.path = path
        self._domain2company = {}  # exact domain -> company name
        self._regex2company = {}   # regex pattern string -> company name
        self._load_file()

    def _load_file(self):
        """Read ``self.path`` and fill the exact-domain and regex lookup tables."""
        # Explicit UTF-8 so that non-ASCII company names are decoded the same
        # way on every platform instead of depending on the locale default.
        # NOTE(review): yaml.FullLoader is not meant for untrusted documents;
        # if `path` can ever point at external data, consider yaml.safe_load.
        with open(self.path, encoding='utf-8') as f:
            match_list = yaml.load(f, Loader=yaml.FullLoader) or []
        for company in match_list:
            name = company.get('company')
            # `or []` tolerates entries where the key is missing or empty.
            for domain in company.get('domains') or []:
                self._domain2company[domain] = name
            for regex in company.get('regex') or []:
                self._regex2company[regex] = name

    def match_company_by_domain(self, domain: str) -> Optional[str]:
        """Return the company matching ``domain``, or ``None`` if there is none.

        An exact entry from ``domains`` wins; otherwise the ``regex`` patterns
        are tried in the order they were loaded and the first one that matches
        at the start of ``domain`` (``re.match`` semantics) decides.

        :param domain: domain to look up; ``None`` yields ``None``.
        :return: the company name, or ``None`` when nothing matches.
        """
        company = self._domain2company.get(domain)
        if company is not None:
            return company
        if domain is None:
            # re.match() would raise TypeError on None; a missing domain
            # simply has no company.
            return None
        for regex, name in self._regex2company.items():
            if re.match(regex, domain):
                return name
        return None

    @classmethod
    def tear_down(cls):
        """Drop the cached singleton so the next construction starts fresh.

        Rebinds ``_instances`` on this class to a new empty dict; the next
        instantiation therefore builds a new matcher and re-reads the match
        list.
        """
        cls._instances = {}