in sourcecode/scoring/pflip_model.py [0:0]
def _get_feature_pipeline(self) -> Pipeline:
"""Returns a scikit-learn pipeline for converting noteInfo into a feature matrix.
The feature extraction pipeline applies different transformations to different
columns within the noteInfo DataFrame. In general:
* User helpfulness and tag ratings are represented with a one-hot encoding.
* Authorship is represented with a one-hot encoding.
* Tag ratios are discretized into quantile buckets, then one-hot encoded.
* Aggregate summary statistics about note ratings are crossed and discretized, then
one-hot encoded.
* Statistics about the rater factors are discretized, then crossed.
Note that since the pipeline also includes feature selection, fitting the pipeline
requires access to both the noteInfo DataFrame and labels.
Returns:
ColumnTransformer composed of 7 constituent Pipelines, each handling a
different group of columns.
"""
# Convert user helpfulness and tag rating directly into model features. Only include
# {user, rating} pairs where the pair occurs at least 5 times, and apply additional
# filtering to the tags.
rating_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(
tokenizer=_identity, preprocessor=_identity, min_df=self._helpfulnessRaterMin
),
)
]
)
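# One-hot encode {user, helpful tag} rating pairs, then keep only the most
# predictive features via chi-squared selection against the training labels.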
helpful_tag_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
),
("selection", SelectPercentile(chi2, percentile=self._helpfulTagPercentile)),
]
)
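# Not-helpful tags are encoded the same way, with a separate selection percentile.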
not_helpful_tag_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
),
("selection", SelectPercentile(chi2, percentile=self._notHelpfulTagPercentile)),
]
)
# Convert authorship to a feature. Note the featurization process is different from
# ratings because there is exactly one author per note.
author_pipeline = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])
# Discretize tag ratios.
tag_pipeline = Pipeline(
[
# During training there should never be a note that doesn't have any ratings in the
# training set, but during prediction there could be notes without ratings, so we
# impute 0. It's unclear what impact this has on predictions, but it doesn't matter
# because we are only using prediction values for notes that are being considered
# for becoming CRH (and therefore have ratings, just like the training data).
("fill_nans_df", FunctionTransformer(_fill_nans)),
("drop_constants", VarianceThreshold()),
(
"binize",
KBinsDiscretizer(n_bins=self._tagRatioBins, encode="onehot", strategy="quantile"),
),
]
)
# Log, cross and discretize rating counts of {helpful, somewhat helpful, not helpful}
# x {left, center, right} for each note.
summary_total_cross_first_pipeline = Pipeline(
[
("log", FunctionTransformer(_feature_log)),
("cross", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
(
"binize",
KBinsDiscretizer(n_bins=self._summaryBins, encode="onehot", strategy="quantile"),
),
]
)
# Discretize and cross stats about the rater factors (e.g. max positive factor
# that rated helpful).
stats_bin_first_pipeline = Pipeline(
[
("fill_nans_df", FunctionTransformer(_fill_nans)),
(
"binize",
KBinsDiscretizer(n_bins=self._helpfulStatsBins, encode="onehot", strategy="uniform"),
),
("cross", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
]
)
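# Assemble the per-column-group pipelines into a single ColumnTransformer; each
# entry names a pipeline and the noteInfo columns it consumes.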
preprocess = ColumnTransformer(
[
("ratings", rating_pipeline, _USER_HELPFULNESS_RATINGS),
("helpful_tags", helpful_tag_pipeline, _USER_HELPFUL_TAGS),
("not_helpful_tags", not_helpful_tag_pipeline, _USER_NOT_HELPFUL_TAGS),
("authors", author_pipeline, [c.noteAuthorParticipantIdKey]),
("tags", tag_pipeline, c.helpfulTagsTSVOrder + c.notHelpfulTagsTSVOrder),
("summary", summary_total_cross_first_pipeline, _BUCKET_COUNT_COLS),
("stats", stats_bin_first_pipeline, _STATS_COLS),
]
)
return preprocess
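# --- Illustrative usage sketch (not part of pflip_model.py) ---
# A minimal, self-contained example of the same ColumnTransformer-of-Pipelines
# pattern used above: pre-tokenized rating lists go through CountVectorizer with
# identity tokenization, the author id is one-hot encoded, and a numeric ratio is
# quantile-binned. Column names, toy data, and bin counts below are hypothetical
# stand-ins for the real noteInfo schema; the feature-selection steps (which need
# labels to fit) are omitted here.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder


def _sketch_identity(value):
  # Pass-through so CountVectorizer consumes pre-tokenized lists as-is.
  return value


_toy_note_info = pd.DataFrame(
  {
    # One list of {rater, rating} tokens per note (hypothetical format).
    "raterHelpfulness": [["r1|HELPFUL", "r2|NOT_HELPFUL"], ["r1|HELPFUL"], ["r3|SOMEWHAT_HELPFUL"]],
    "noteAuthorParticipantId": ["authorA", "authorB", "authorA"],
    "helpfulTagRatio": [0.9, 0.1, 0.5],
  }
)

_toy_preprocess = ColumnTransformer(
  [
    (
      "ratings",
      Pipeline(
        [("onehot", CountVectorizer(tokenizer=_sketch_identity, preprocessor=_sketch_identity, min_df=1))]
      ),
      "raterHelpfulness",  # scalar column name: CountVectorizer receives a 1-D iterable
    ),
    ("authors", Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]), ["noteAuthorParticipantId"]),
    (
      "tags",
      Pipeline([("binize", KBinsDiscretizer(n_bins=2, encode="onehot", strategy="quantile"))]),
      ["helpfulTagRatio"],
    ),
  ]
)

# fit_transform learns vocabularies, categories, and bin edges, then emits one
# feature row per note.
_toy_features = _toy_preprocess.fit_transform(_toy_note_info)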