in sourcecode/scoring/pflip_model.py [0:0]
def _get_feature_pipeline(self) -> Pipeline:
"""Returns a scikit-learn pipeline for converting noteInfo into a feature matrix.
The feature extraction pipeline applies different transformations to different
columns within the noteInfo DataFrame. In general:
* User helpfulness and tag ratings are represented with a one-hot encoding.
* Authorship is represented with a one-hot encoding.
* Tag ratios are discretized into quantile buckets, then one-hot encoded.
* Aggregate summary statistics about note ratings are crossed and discretized, then
one-hot encoded.
* Statistics about the rater factors are discretized, then crossed.
Note that since the pipeline also includes feature selection, fitting the pipeline
requires access to both the noteInfo DataFrame and labels.
Returns:
ColumnTransformer composed of 7 constituent Pipelines, each handling a
different group of columns.
"""
# Convert user helpfulness and tag rating directly into model features. Only include
# {user, rating} pairs where the pair occurs at least 5 times, and apply additional
# filtering to the tags.
rating_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(
tokenizer=_identity, preprocessor=_identity, min_df=self._helpfulnessRaterMin
),
)
]
)
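# One-hot encode {user, helpful tag} rating pairs, then keep only the most
# predictive features via chi-squared selection against the training labels.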
helpful_tag_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
),
("selection", SelectPercentile(chi2, percentile=self._helpfulTagPercentile)),
]
)
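# Not-helpful tags are encoded the same way, with a separate selection percentile.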
not_helpful_tag_pipeline = Pipeline(
[
(
"onehot",
CountVectorizer(tokenizer=_identity, preprocessor=_identity, min_df=self._tagRaterMin),
),
("selection", SelectPercentile(chi2, percentile=self._notHelpfulTagPercentile)),
]
)
# Convert authorship to a feature. Note the featurization process is different from
# ratings because there is exactly one author per note.
author_pipeline = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])
# Discretize tag ratios.
tag_pipeline = Pipeline(
[
# During training there should never be a note that doesn't have any ratings in the
# training set, but during prediction there could be notes without ratings, so we
# impute 0. It's unclear what impact this has on predictions, but it doesn't matter
# because we are only using prediction values for notes that are being considered
# for becoming CRH (and therefore have ratings, just like the training data).
("fill_nans_df", FunctionTransformer(_fill_nans)),
("drop_constants", VarianceThreshold()),
(
"binize",
KBinsDiscretizer(n_bins=self._tagRatioBins, encode="onehot", strategy="quantile"),
),
]
)
# Log, cross and discretize rating counts of {helpful, somewhat helpful, not helpful}
# x {left, center, right} for each note.
summary_total_cross_first_pipeline = Pipeline(
[
("log", FunctionTransformer(_feature_log)),
("cross", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
(
"binize",
KBinsDiscretizer(n_bins=self._summaryBins, encode="onehot", strategy="quantile"),
),
]
)
# Discretize and cross stats about the rater factors (e.g. max positive factor
# that rated helpful).
stats_bin_first_pipeline = Pipeline(
[
("fill_nans_df", FunctionTransformer(_fill_nans)),
(
"binize",
KBinsDiscretizer(n_bins=self._helpfulStatsBins, encode="onehot", strategy="uniform"),
),
("cross", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
]
)
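# Assemble the per-column-group pipelines into a single ColumnTransformer; each
# entry names a pipeline and the noteInfo columns it consumes.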
preprocess = ColumnTransformer(
[
("ratings", rating_pipeline, _USER_HELPFULNESS_RATINGS),
("helpful_tags", helpful_tag_pipeline, _USER_HELPFUL_TAGS),
("not_helpful_tags", not_helpful_tag_pipeline, _USER_NOT_HELPFUL_TAGS),
("authors", author_pipeline, [c.noteAuthorParticipantIdKey]),
("tags", tag_pipeline, c.helpfulTagsTSVOrder + c.notHelpfulTagsTSVOrder),
("summary", summary_total_cross_first_pipeline, _BUCKET_COUNT_COLS),
("stats", stats_bin_first_pipeline, _STATS_COLS),
]
)
return preprocess
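# --- Illustrative usage sketch (not part of pflip_model.py) ---
# A minimal, self-contained example of the same ColumnTransformer-of-Pipelines
# pattern used above: pre-tokenized rating lists go through CountVectorizer with
# identity tokenization, the author id is one-hot encoded, and a numeric ratio is
# quantile-binned. Column names, toy data, and bin counts below are hypothetical
# stand-ins for the real noteInfo schema; the feature-selection steps (which need
# labels to fit) are omitted here.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder


def _sketch_identity(value):
  # Pass-through so CountVectorizer consumes pre-tokenized lists as-is.
  return value


_toy_note_info = pd.DataFrame(
  {
    # One list of {rater, rating} tokens per note (hypothetical format).
    "raterHelpfulness": [["r1|HELPFUL", "r2|NOT_HELPFUL"], ["r1|HELPFUL"], ["r3|SOMEWHAT_HELPFUL"]],
    "noteAuthorParticipantId": ["authorA", "authorB", "authorA"],
    "helpfulTagRatio": [0.9, 0.1, 0.5],
  }
)

_toy_preprocess = ColumnTransformer(
  [
    (
      "ratings",
      Pipeline(
        [("onehot", CountVectorizer(tokenizer=_sketch_identity, preprocessor=_sketch_identity, min_df=1))]
      ),
      "raterHelpfulness",  # scalar column name: CountVectorizer receives a 1-D iterable
    ),
    ("authors", Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]), ["noteAuthorParticipantId"]),
    (
      "tags",
      Pipeline([("binize", KBinsDiscretizer(n_bins=2, encode="onehot", strategy="quantile"))]),
      ["helpfulTagRatio"],
    ),
  ]
)

# fit_transform learns vocabularies, categories, and bin edges, then emits one
# feature row per note.
_toy_features = _toy_preprocess.fit_transform(_toy_note_info)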