diff --git a/conda-recipe/skll/meta.yaml b/conda-recipe/skll/meta.yaml
index 47ea961f..fc832c55 100644
--- a/conda-recipe/skll/meta.yaml
+++ b/conda-recipe/skll/meta.yaml
@@ -37,6 +37,7 @@ requirements:
     - setuptools
    - beautifulsoup4
     - joblib
+    - mord
     - numpy {{ numpy }}
     - pandas
     - ruamel.yaml
@@ -49,6 +50,7 @@ requirements:
     - python
     - beautifulsoup4
     - joblib
+    - mord
     - numpy
     - pandas
     - ruamel.yaml
diff --git a/conda_requirements.txt b/conda_requirements.txt
index 6fb3cf5d..9dd209e3 100644
--- a/conda_requirements.txt
+++ b/conda_requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4
 joblib
+mord
 numpy
 nose-cov
 pandas
diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst
index 7e016b9e..27877fa1 100644
--- a/doc/run_experiment.rst
+++ b/doc/run_experiment.rst
@@ -334,11 +334,13 @@ Regressors:
     * **GradientBoostingRegressor**: `Gradient Boosting Regressor `__
     * **HuberRegressor**: `Huber Regression `__
     * **KNeighborsRegressor**: `K-Nearest Neighbors Regression `__
+    * **LAD**: `Least Absolute Deviation `__
     * **Lars**: `Least Angle Regression `__
     * **Lasso**: `Lasso Regression `__
     * **LinearRegression**: `Linear Regression `__
     * **LinearSVR**: `Support Vector Regression using LibLinear `__
     * **MLPRegressor**: `Multi-layer Perceptron Regression `__
+    * **OrdinalRidge**: `Ridge Regression with negative absolute error as score `__
     * **RandomForestRegressor**: `Random Forest Regression `__
     * **RANSACRegressor**: `RANdom SAmple Consensus Regression `__. Note that the default base estimator is a ``LinearRegression``. A different base regressor can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list.
     * **Ridge**: `Ridge Regression `__
@@ -354,11 +356,17 @@ Regressors:

     Refer to this `example voting configuration file `__
     to see how these parameters are used.

-    For all regressors *except* ``VotingRegressor``, you can also prepend
-    ``Rescaled`` to the beginning of the full name (e.g., ``RescaledSVR``)
-    to get a version of the regressor where predictions are rescaled and
-    constrained to better match the training set. Rescaled regressors
-    can, however, be used as underlying estimators for ``VotingRegressor``
+    For all regressors *except* ``LAD``, ``OrdinalRidge``, and ``VotingRegressor``,
+    you can also prepend ``Rescaled`` to the beginning of the full name
+    (e.g., ``RescaledSVR``) to get a version of the regressor where predictions
+    are rescaled and constrained to better match the training set.
+
+    ``Rescaled`` versions of the ``LAD`` and ``OrdinalRidge`` regressors are not
+    available because these models already clip their predictions to the range
+    from zero to the maximum of the training labels, so rescaling the
+    predictions would no longer correspond to the original values.
+
+    Rescaled regressors can, however, be used as underlying estimators for ``VotingRegressor``
     learners.

 .. _featuresets:
@@ -611,7 +619,7 @@ Lasso:
     {'random_state': 123456789}

-LinearSVC and LinearSVR
+LAD, LinearSVC and LinearSVR

 .. code-block:: python

     {'random_state': 123456789}
@@ -638,7 +646,7 @@ RANSACRegressor
     {'loss': 'squared_loss', 'random_state': 123456789}

-Ridge and RidgeClassifier
+OrdinalRidge, Ridge and RidgeClassifier

 .. code-block:: python

     {'random_state': 123456789}
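The clipping rationale in the documentation change above can be demonstrated directly. Below is a minimal sketch, assuming only that ``mord`` is installed; the toy data and the script itself are illustrative and not part of this patch:

```python
import numpy as np
from mord import LAD, OrdinalRidge

# four training points with integer labels 0..3
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 1, 2, 3])

for model_class in [LAD, OrdinalRidge]:
    model = model_class().fit(X, y)
    # inputs far outside the training range should still yield
    # predictions inside [0, max(y)], per the clipping described above
    print(model_class.__name__,
          model.predict(np.array([[100.0], [-100.0]])))
```

Stretching these constrained values back out with the ``Rescaled`` wrapper would distort them, which is why the wrapper is disabled for these two learners.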
diff --git a/requirements.txt b/requirements.txt
index 3cf6143d..bcff0c56 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4
 joblib
+mord
 numpy
 pandas
 ruamel.yaml
diff --git a/skll/learner/__init__.py b/skll/learner/__init__.py
index 9d0b2125..e7137c89 100644
--- a/skll/learner/__init__.py
+++ b/skll/learner/__init__.py
@@ -18,6 +18,7 @@
 import joblib
 import numpy as np
 import scipy.sparse as sp
+from mord import LAD, OrdinalRidge
 from sklearn.dummy import DummyClassifier, DummyRegressor  # noqa: F401
 from sklearn.ensemble import (
     AdaBoostClassifier,
@@ -274,16 +275,31 @@ def __init__(self,  # noqa: C901
             self._model_kwargs['multi_class'] = 'auto'

         if issubclass(self._model_type,
-                      (AdaBoostClassifier, AdaBoostRegressor,
-                       DecisionTreeClassifier, DecisionTreeRegressor,
-                       DummyClassifier, ElasticNet,
+                      (AdaBoostClassifier,
+                       AdaBoostRegressor,
+                       DecisionTreeClassifier,
+                       DecisionTreeRegressor,
+                       DummyClassifier,
+                       ElasticNet,
                        GradientBoostingClassifier,
-                       GradientBoostingRegressor, Lasso, LinearSVC,
-                       LinearSVR, LogisticRegression, MLPClassifier,
-                       MLPRegressor, RandomForestClassifier,
-                       RandomForestRegressor, RANSACRegressor, Ridge,
-                       RidgeClassifier, SGDClassifier, SGDRegressor,
-                       SVC, TheilSenRegressor)):
+                       GradientBoostingRegressor,
+                       LAD,
+                       Lasso,
+                       LinearSVC,
+                       LinearSVR,
+                       LogisticRegression,
+                       MLPClassifier,
+                       MLPRegressor,
+                       OrdinalRidge,
+                       RandomForestClassifier,
+                       RandomForestRegressor,
+                       RANSACRegressor,
+                       Ridge,
+                       RidgeClassifier,
+                       SGDClassifier,
+                       SGDRegressor,
+                       SVC,
+                       TheilSenRegressor)):
             self._model_kwargs['random_state'] = 123456789

         if sampler_kwargs:
@@ -612,7 +628,6 @@ def _create_estimator(self):
         if default_param_grid is None:
             raise ValueError(f"{self._model_type.__name__} is not a valid "
                              "learner type.")
-
         estimator = self._model_type(**self._model_kwargs)
         return estimator, default_param_grid
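With the import and seeding changes above applied (and ``mord`` installed), the two new names should work anywhere SKLL accepts a learner name. A hypothetical usage sketch; ``Learner`` and its ``model_type`` property are existing SKLL API, everything else is illustrative:

```python
from skll.learner import Learner

# both names resolve to the mord estimators imported above, and each
# learner picks up the fixed random_state of 123456789 from the
# seeding block in __init__
for name in ['LAD', 'OrdinalRidge']:
    learner = Learner(name)
    print(name, '->', learner.model_type.__name__)
```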
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 20325e4d..e325b30e 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -131,6 +131,10 @@ def check_rescaling(name, grid_search=False):


 def test_rescaling():
+    """Test that rescaled models give the same performance as the originals."""
+    # LAD and OrdinalRidge are excluded here because they clip
+    # their predictions, which makes the rescaled predictions
+    # differ from the originals.
     for regressor_name in ['BayesianRidge',
                            'ElasticNet',
                            'HuberRegressor',
@@ -236,7 +240,8 @@ def test_linear_models():

 # the utility function to run the non-linear tests
 def check_non_linear_models(name,
                             use_feature_hashing=False,
-                            use_rescaling=False):
+                            use_rescaling=False,
+                            expected_corr=0.95):

     # create a FeatureSet object with the data we want to use
     if use_feature_hashing:
@@ -269,7 +274,7 @@ def check_non_linear_models(name,
     # using make_regression_data. To do this, we just
     # make sure that they are correlated with pearson > 0.95
     cor, _ = pearsonr(predictions, test_fs.labels)
-    assert_greater(cor, 0.95)
+    assert_greater(cor, expected_corr)


 # the runner function for non-linear regression models
@@ -284,11 +289,23 @@ def test_non_linear_models():
         yield (check_non_linear_models,
                regressor_name,
                use_feature_hashing,
-               use_rescaling)
+               use_rescaling,
+               0.95)


-# the utility function to run the tree-based regression tests
+# the runner function for mord regression models
+def test_mord_models():
+    for (regressor_name,
+         use_feature_hashing) in product(['OrdinalRidge', 'LAD'],
+                                         [False, True]):
+        yield (check_non_linear_models,
+               regressor_name,
+               use_feature_hashing,
+               False,
+               0.86)
+
+
+# the utility function to run the tree-based regression tests
 def check_tree_models(name,
                       use_feature_hashing=False,
                       use_rescaling=False):
@@ -696,8 +713,8 @@ def test_invalid_regression_grid_objective():
     for learner in ['AdaBoostRegressor', 'BayesianRidge',
                     'DecisionTreeRegressor', 'ElasticNet',
                     'GradientBoostingRegressor', 'HuberRegressor',
-                    'KNeighborsRegressor', 'Lars', 'Lasso',
-                    'LinearRegression', 'MLPRegressor',
+                    'KNeighborsRegressor', 'LAD', 'Lars', 'Lasso',
+                    'LinearRegression', 'MLPRegressor', 'OrdinalRidge',
                     'RandomForestRegressor', 'RANSACRegressor', 'Ridge',
                     'LinearSVR', 'SVR', 'SGDRegressor',
                     'TheilSenRegressor']:
@@ -721,8 +738,8 @@ def test_invalid_regression_metric():
     for learner in ['AdaBoostRegressor', 'BayesianRidge',
                     'DecisionTreeRegressor', 'ElasticNet',
                     'GradientBoostingRegressor', 'HuberRegressor',
-                    'KNeighborsRegressor', 'Lars', 'Lasso',
-                    'LinearRegression', 'MLPRegressor',
+                    'KNeighborsRegressor', 'LAD', 'Lars', 'Lasso',
+                    'LinearRegression', 'MLPRegressor', 'OrdinalRidge',
                     'RandomForestRegressor', 'RANSACRegressor', 'Ridge',
                     'LinearSVR', 'SVR', 'SGDRegressor',
                     'TheilSenRegressor']:
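To make the new test path concrete, here is a rough end-to-end sketch of what ``test_mord_models`` exercises, with synthetic data standing in for SKLL's ``make_regression_data`` fixtures. ``FeatureSet``, ``Learner``, and ``pearsonr`` are existing APIs; the data, feature names, and threshold check simply mirror the test above and are otherwise illustrative:

```python
import numpy as np
from scipy.stats import pearsonr
from skll.data import FeatureSet
from skll.learner import Learner

# synthetic ordinal data: integer labels driven by a linear signal
rng = np.random.RandomState(123456789)
X = rng.normal(size=(100, 3))
y = np.clip(np.round(X.sum(axis=1) + 3), 0, 6).astype(int)

features = [{f'f{j}': X[i, j] for j in range(3)} for i in range(100)]
fs = FeatureSet('toy', ids=[f'EXAMPLE_{i}' for i in range(100)],
                features=features, labels=y)

# train one of the new mord learners without grid search and check
# that its (clipped) predictions still track the gold labels
learner = Learner('OrdinalRidge')
learner.train(fs, grid_search=False)
predictions = learner.predict(fs)

corr, _ = pearsonr(predictions, fs.labels)
assert corr > 0.86  # the relaxed threshold used for the mord learners
```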