def prescore()

in sourcecode/scoring/scorer.py [0:0]


  def prescore(self, scoringArgs: PrescoringArgs, preserveRatings: bool = True) -> ModelResult:
    """
    Run the prescoring pass for this scorer and package its intermediate output.

    The ratings are narrowed to a fixed set of columns, passed through self._filter_input, and
    the filtered result is handed to self._prescore_notes_and_users.  The output is intended to
    initialize final scoring and reduce its runtime.

    Args:
      scoringArgs: prescoring inputs.  noteTopics, ratings, noteStatusHistory and userEnrollment
        are read from it.
      preserveRatings: when False, scoringArgs.ratings is deleted and a GC pass is forced as
        soon as filtering is done.  Keep the default (True) whenever later scorers will read
        the same scoringArgs.

    Returns:
      ModelResult holding the note scores, rater scores, auxiliary note info, this scorer's
      name and the meta scores.  If no ratings remain after filtering, the result instead holds
      an empty scored-notes dataframe, empty dataframes (or None when the scorer reports no
      columns) for rater scores and auxiliary info, and None for the meta scores.
    """
    torch.set_num_threads(self._threads)
    logger.info(
      f"prescore: Torch intra-op parallelism for {self.get_name()} set to: {torch.get_num_threads()}"
    )

    # Stage 1: transform the input.
    with self.time_block("Filter input"):
      # Rating columns that are carried into filtering; everything else is dropped.
      ratingColumns = (
        [
          c.noteIdKey,
          c.raterParticipantIdKey,
          c.helpfulNumKey,
          c.helpfulnessLevelKey,
          c.createdAtMillisKey,
        ]
        + c.notHelpfulTagsTSVOrder
        + c.helpfulTagsTSVOrder
      )
      # keep_columns(...) is deliberately passed inline: binding its result to a local would
      # keep an extra reference to that object alive for the rest of this method.
      filteredRatings, statusHistory = self._filter_input(
        scoringArgs.noteTopics,
        keep_columns(scoringArgs.ratings, ratingColumns),
        scoringArgs.noteStatusHistory,
        scoringArgs.userEnrollment,
      )

      if not preserveRatings:
        # Dropping the caller's ratings is reserved for parallel runs; in a sequential run the
        # scorers that execute afterwards still need them.
        del scoringArgs.ratings
        gc.collect()

      if len(filteredRatings) == 0:
        # Nothing survived filtering, so hand back empty dataframes.
        emptyScoredNotes = pd.DataFrame(columns=self.get_internal_scored_notes_cols())
        emptyRaterScores = None
        if self.get_internal_helpfulness_scores_cols():
          emptyRaterScores = pd.DataFrame(columns=self.get_internal_helpfulness_scores_cols())
        emptyAuxiliaryInfo = None
        if self.get_auxiliary_note_info_cols():
          emptyAuxiliaryInfo = pd.DataFrame(columns=self.get_auxiliary_note_info_cols())
        return ModelResult(
          emptyScoredNotes,
          emptyRaterScores,
          emptyAuxiliaryInfo,
          self.get_name(),
          None,
        )

    # Stage 2: run the core prescoring algorithm.
    prescoredNotes, prescoredRaters, metaOutput = self._prescore_notes_and_users(
      filteredRatings, statusHistory, scoringArgs.userEnrollment
    )

    # The reference to the filtered ratings would go away on return anyway; releasing it here
    # and forcing a collection reclaims the memory as early as possible.
    del filteredRatings
    gc.collect()

    # Stage 3: transform the output.  reindex puts the columns in the specified order and adds
    # any required column that is missing from the frame, filled with NaN.
    scoredNotes = prescoredNotes.reindex(
      columns=c.prescoringNoteModelOutputTSVColumns, fill_value=np.nan
    )
    helpfulnessScores = prescoredRaters.reindex(
      columns=c.prescoringRaterModelOutputTSVColumns, fill_value=np.nan
    )
    auxiliaryNoteInfo = prescoredNotes.reindex(
      columns=self.get_auxiliary_note_info_cols(), fill_value=np.nan
    )
    return ModelResult(
      scoredNotes=scoredNotes,
      helpfulnessScores=helpfulnessScores,
      auxiliaryNoteInfo=auxiliaryNoteInfo,
      scorerName=self.get_name(),
      metaScores=metaOutput,
    )