sourcecode/scoring/reputation_matrix_factorization/dataset.py (41 lines of code) (raw):

from dataclasses import dataclass
from typing import Dict

from .. import constants as c

import numpy as np
import pandas as pd
import torch


@dataclass
class MatrixFactorizationDataset:
    """Container for the tensors and ID/index mappings produced by build_dataset."""

    # Tensors specifying the note, rater and target for each rating.
    # noteTensor / raterTensor hold int32 indices (one entry per rating row);
    # targetTensor holds the float32 target for each rating.
    noteTensor: torch.Tensor
    raterTensor: torch.Tensor
    targetTensor: torch.Tensor
    # Ordered (sorted, de-duplicated) notes and raters associated with each index
    notes: np.ndarray  # noteIds  # idx -> id
    raters: np.ndarray  # raterIds  # idx -> id
    # Maps of id to index (inverse of the arrays above)
    raterIdToIndex: Dict  #: Dict[int, int]
    noteIdToIndex: Dict  #: Dict[int, int]


def _to_index_tensor(ids, idToIndex: Dict, device: torch.device) -> torch.Tensor:
    """Translate each ID in `ids` to its index and return an int32 tensor on `device`."""
    indices = np.array([idToIndex[id_] for id_ in ids], dtype=np.int32)
    # torch.tensor (unlike the legacy torch.IntTensor constructor, which is tied to
    # the CPU tensor type) can place the result on any device; the int32 dtype is
    # inherited from the numpy array, so empty inputs stay int32 as well.
    return torch.tensor(indices, device=device)


def build_dataset(
    ratings: pd.DataFrame,
    targets: np.ndarray,
    device: torch.device = torch.device("cpu"),
) -> MatrixFactorizationDataset:
    """Compose and return a MatrixFactorizationDataset given ratings and targets.

    Args:
      ratings: DF specifying notes and raters
      targets: numpy array specifying target values
      device: torch device where tensors should be stored (e.g. cuda, mps, cpu)

    Returns:
      MatrixFactorizationDataset whose note/rater tensors contain, for every row of
      `ratings`, the int32 index of that row's note / rater within the sorted unique
      ID arrays, and whose target tensor is `targets` converted to float32.

    Raises:
      KeyError: if `ratings` lacks the note ID or rater ID column.
    """
    # Identify mappings from note and rater IDs to indices.  IDs are de-duplicated
    # and sorted so that index assignment is deterministic for a given set of IDs.
    notes = ratings[c.noteIdKey].drop_duplicates().sort_values().values
    noteIdToIndex = dict(zip(notes, np.arange(len(notes), dtype=np.int32)))
    raters = ratings[c.raterParticipantIdKey].drop_duplicates().sort_values().values
    raterIdToIndex = dict(zip(raters, np.arange(len(raters), dtype=np.int32)))
    # Generate tensors
    noteTensor = _to_index_tensor(ratings[c.noteIdKey], noteIdToIndex, device)
    raterTensor = _to_index_tensor(
        ratings[c.raterParticipantIdKey], raterIdToIndex, device
    )
    targetTensor = torch.tensor(targets, device=device, dtype=torch.float32)
    # Return MatrixFactorizationDataset
    return MatrixFactorizationDataset(
        noteTensor=noteTensor,
        raterTensor=raterTensor,
        targetTensor=targetTensor,
        notes=notes,
        raters=raters,
        raterIdToIndex=raterIdToIndex,
        noteIdToIndex=noteIdToIndex,
    )