spotify_tensorflow/featran.py (59 lines of code) (raw):

# -*- coding: utf-8 -*-
#
# Copyright 2017-2019 Spotify AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

import json
from collections import OrderedDict, defaultdict
from os.path import join as pjoin
from typing import Callable, List, Dict, Any, Union, Iterator  # noqa: F401

import numpy as np  # noqa: F401
import pandas as pd  # noqa: F401
from tensorflow.python.lib.io import file_io


class Featran(object):
    """
    Helpers for reading a Featran settings JSON file and for ordering dataset
    batches by the feature names that file lists.
    """

    @classmethod
    def settings(cls, settings_dir, settings_filename=None):
        # type: (str, str) -> List[Dict[str, Any]]
        """
        Read a Featran settings file and return a list of settings

        :param settings_dir: Path to the directory containing the settings file
        :param settings_filename: Filename of the Featran Settings JSON file. When omitted
                                  (or empty), "part-00000-of-00001.txt" is used
        :return: A List of Featran Settings
        :raises AssertionError: If the settings file does not exist
        """
        f = cls.__get_featran_settings_file(settings_dir, settings_filename)
        with file_io.FileIO(f, "r") as fio:
            return json.load(fio)

    @classmethod
    def names(cls, settings_path, feature_splitter_fn=None, settings_filename=None):
        # type: (str, Callable[[Any], str], str) -> Union[List[str], Dict[str, List[str]]]
        """
        Returns a list of Featran feature names.
        Optionally the list of names can be split into a dictionary keyed by the
        feature_splitter_fn

        :param settings_path: Path to the Featran Settings JSON Directory
        :param feature_splitter_fn: Function to split feature name into a keyed dictionary.
                                    It is called with each setting's "name" value and its
                                    result is used as the dictionary key
        :param settings_filename: Filename of the Featran Settings JSON file inside
                                  settings_path. When omitted, the default used by
                                  `settings` applies
        :return: A List or dictionary of Featran Feature names
        :raises AssertionError: If the settings file does not exist
        """
        settings = cls.settings(settings_path, settings_filename)
        if feature_splitter_fn:
            return cls.__split_names(settings, feature_splitter_fn)
        return cls.__all_names(settings)

    @classmethod
    def reorder_numpy_dataset(cls,
                              dataset,  # type: Iterator[Dict[str, np.ndarray]]
                              settings_path,  # type: str
                              settings_filename=None  # type: str
                              ):
        # type: (...) -> Iterator[OrderedDict[str, np.ndarray]]
        """
        Reorders a numpy dictionary so that feature keys are in the same order as those in
        a Featran settings file.

        This is a generator: the settings file is only read once iteration starts. Keys of
        a batch that are not listed in the settings file are dropped from the output.

        :param dataset: A dataset created via Datasets.dict
        :param settings_path: Path to the Featran Settings JSON Directory
        :param settings_filename: Filename of the Featran Settings JSON file inside
                                  settings_path. When omitted, the default used by
                                  `settings` applies
        :return: An iterator over an OrderedDict mapping feature names to Numpy arrays
        """
        feature_names = cls.names(settings_path, settings_filename=settings_filename)
        for batch in dataset:
            yield OrderedDict((name, batch[name]) for name in feature_names)

    @classmethod
    def reorder_dataframe_dataset(cls,
                                  dataset,  # type: Iterator[pd.DataFrame]
                                  settings_path,  # type: str
                                  settings_filename=None  # type: str
                                  ):
        # type: (...) -> Iterator[pd.DataFrame]
        """
        Reorders a pandas DataFrame so that feature columns are in the same order as those
        in a Featran settings file.

        This is a generator: the settings file is only read once iteration starts.

        :param dataset: A dataset created via Datasets.dataframe
        :param settings_path: Path to the Featran Settings JSON Directory
        :param settings_filename: Filename of the Featran Settings JSON file inside
                                  settings_path. When omitted, the default used by
                                  `settings` applies
        :return: An iterator over new DataFrame batches with ordered columns
        """
        feature_names = cls.names(settings_path, settings_filename=settings_filename)
        for batch in dataset:
            # Indexing with the list of names selects the columns in that order.
            yield batch[feature_names]

    @staticmethod
    def __get_featran_settings_file(dir_path, settings_filename=None):
        # type: (str, str) -> str
        """
        Build the path of the settings file inside dir_path and check that it exists.

        :param dir_path: Directory containing the settings file
        :param settings_filename: Filename to look for; falls back to
                                  "part-00000-of-00001.txt" when None or empty
        :return: The joined path of the settings file
        :raises AssertionError: If no file exists at the joined path
        """
        filename = settings_filename if settings_filename else "part-00000-of-00001.txt"
        filepath = pjoin(dir_path, filename)
        # Raised explicitly instead of using an `assert` statement: asserts are removed
        # when Python runs with -O, which would silently skip this check. The exception
        # type and message are the ones the assert produced.
        if not file_io.file_exists(filepath):
            raise AssertionError("settings file `%s` does not exist" % filepath)
        return filepath

    @staticmethod
    def __split_names(settings, feature_splitter_fn):
        # type: (List[Dict[str, Any]], Callable[[Any], str]) -> Dict[str, List[str]]
        """
        Group feature names by the key feature_splitter_fn returns for each setting's "name".

        :param settings: Parsed Featran settings, each with "name" and "featureNames" entries
        :param feature_splitter_fn: Maps a setting's "name" to the key its features go under
        :return: A defaultdict(list) mapping each key to its feature names, in file order
        """
        feature_names = defaultdict(list)  # type: Dict[str, List[str]]
        for setting in settings:
            key = feature_splitter_fn(setting["name"])
            # Append one by one so that a key is only created when the setting
            # actually contributes at least one feature name.
            for name in setting["featureNames"]:
                feature_names[key].append(name)
        return feature_names

    @staticmethod
    def __all_names(settings):
        # type: (List[Dict[str, Any]]) -> List[str]
        """
        Flatten the "featureNames" of every setting into a single list, in file order.

        :param settings: Parsed Featran settings, each with a "featureNames" entry
        :return: All feature names, in the order they appear in the settings
        """
        return [name for setting in settings for name in setting["featureNames"]]