in sourcecode/scoring/scorer.py [0:0]
def prescore(self, scoringArgs: PrescoringArgs, preserveRatings: bool = True) -> ModelResult:
    """
    Execute the prescoring phase: the initial rounds of the matrix factorization scoring
    algorithm, whose intermediate output can be used to initialize final scoring and
    reduce its runtime.

    Args:
      scoringArgs: prescoring inputs.  noteTopics, ratings, noteStatusHistory and
        userEnrollment are read from it.
      preserveRatings: when False, scoringArgs.ratings is deleted once the input has been
        filtered so the memory can be reclaimed.  Only pass False when scorers run in
        parallel; otherwise later scorers still need the ratings.

    Returns:
      ModelResult carrying note scores, rater scores, auxiliary note info, the scorer name
      and meta scores.  When filtering leaves no ratings, the frames are empty (or None
      where the scorer reports no columns for them) and the meta scores are None.
    """
    torch.set_num_threads(self._threads)
    logger.info(
        f"prescore: Torch intra-op parallelism for {self.get_name()} set to: {torch.get_num_threads()}"
    )

    # Stage 1: narrow the ratings to the columns handed to filtering, then filter.
    with self.time_block("Filter input"):
        ratingColumns = (
            [
                c.noteIdKey,
                c.raterParticipantIdKey,
                c.helpfulNumKey,
                c.helpfulnessLevelKey,
                c.createdAtMillisKey,
            ]
            + c.notHelpfulTagsTSVOrder
            + c.helpfulTagsTSVOrder
        )
        trimmedRatings = keep_columns(scoringArgs.ratings, ratingColumns)
        filteredRatings, filteredStatusHistory = self._filter_input(
            scoringArgs.noteTopics,
            trimmedRatings,
            scoringArgs.noteStatusHistory,
            scoringArgs.userEnrollment,
        )
        del trimmedRatings
        if not preserveRatings:
            # Dropping the shared ratings is only safe in parallel runs; in a sequential
            # run the scorers that come later still read them.
            del scoringArgs.ratings
            gc.collect()

    # Stage 2: nothing survived filtering, so hand back empty placeholders.
    if len(filteredRatings) == 0:
        emptyNoteScores = pd.DataFrame(columns=self.get_internal_scored_notes_cols())
        if self.get_internal_helpfulness_scores_cols():
            emptyRaterScores = pd.DataFrame(columns=self.get_internal_helpfulness_scores_cols())
        else:
            emptyRaterScores = None
        if self.get_auxiliary_note_info_cols():
            emptyAuxiliaryInfo = pd.DataFrame(columns=self.get_auxiliary_note_info_cols())
        else:
            emptyAuxiliaryInfo = None
        return ModelResult(
            emptyNoteScores,
            emptyRaterScores,
            emptyAuxiliaryInfo,
            self.get_name(),
            None,
        )

    # Stage 3: run the core prescoring algorithm.
    noteOutput, raterOutput, metaOutput = self._prescore_notes_and_users(
        filteredRatings, filteredStatusHistory, scoringArgs.userEnrollment
    )

    # The reference would go away on return anyway; collecting now reclaims the memory
    # as early as possible.
    del filteredRatings
    gc.collect()

    # Stage 4: emit the required columns in the required order.  reindex adds any required
    # column that is missing from the frame and fills it with NaN.
    scoredNotesFrame = noteOutput.reindex(
        columns=c.prescoringNoteModelOutputTSVColumns, fill_value=np.nan
    )
    raterScoresFrame = raterOutput.reindex(
        columns=c.prescoringRaterModelOutputTSVColumns, fill_value=np.nan
    )
    auxiliaryInfoFrame = noteOutput.reindex(
        columns=self.get_auxiliary_note_info_cols(), fill_value=np.nan
    )
    return ModelResult(
        scoredNotes=scoredNotesFrame,
        helpfulnessScores=raterScoresFrame,
        auxiliaryNoteInfo=auxiliaryInfoFrame,
        scorerName=self.get_name(),
        metaScores=metaOutput,
    )