From f8ddadf448207bf250e8d9ad84aedcd4db12e1db Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 23 Nov 2019 11:42:05 +0000 Subject: [PATCH 01/26] Unit Testing Production ML Code - Test Preprocessing --- .gitignore | 1 + .../gradient_boosting_model/requirements.txt | 3 ++ .../gradient_boosting_model/tests/conftest.py | 23 ++++++++++++ .../tests/test_pipeline.py | 37 +++++++++++++++++++ .../tests/test_preprocessors.py | 37 +++++++++++++++++++ packages/gradient_boosting_model/tox.ini | 27 ++++++++++++++ 6 files changed, 128 insertions(+) create mode 100644 packages/gradient_boosting_model/tests/conftest.py create mode 100644 packages/gradient_boosting_model/tests/test_pipeline.py create mode 100644 packages/gradient_boosting_model/tests/test_preprocessors.py create mode 100644 packages/gradient_boosting_model/tox.ini diff --git a/.gitignore b/.gitignore index 5346ee9..82d285c 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.tox/ # Spyder project settings .spyderproject diff --git a/packages/gradient_boosting_model/requirements.txt b/packages/gradient_boosting_model/requirements.txt index 198c30e..9af412d 100644 --- a/packages/gradient_boosting_model/requirements.txt +++ b/packages/gradient_boosting_model/requirements.txt @@ -15,3 +15,6 @@ marshmallow>=3.2.2,<4.0 # packaging setuptools>=41.4.0,<42.0.0 wheel>=0.33.6,<0.34.0 + +# testing requirements +pytest>=5.3.2,<6.0.0 diff --git a/packages/gradient_boosting_model/tests/conftest.py b/packages/gradient_boosting_model/tests/conftest.py new file mode 100644 index 0000000..076de27 --- /dev/null +++ b/packages/gradient_boosting_model/tests/conftest.py @@ -0,0 +1,23 @@ +import pytest +from sklearn.model_selection import train_test_split + +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.data_management import load_dataset + + +@pytest.fixture(scope="session") +def pipeline_inputs(): + # For larger datasets, here we would 
use a testing sub-sample. + data = load_dataset(file_name=config.app_config.training_data_file) + + # Divide train and test + X_train, X_test, y_train, y_test = train_test_split( + data[config.model_config.features], # predictors + data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + return X_train, X_test, y_train, y_test diff --git a/packages/gradient_boosting_model/tests/test_pipeline.py b/packages/gradient_boosting_model/tests/test_pipeline.py new file mode 100644 index 0000000..e6ef088 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_pipeline.py @@ -0,0 +1,37 @@ +from gradient_boosting_model import pipeline +from gradient_boosting_model.config.core import config + + +def test_pipeline_drops_unnecessary_features(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + assert config.model_config.drop_features in X_train.columns + + # When + # We use the scikit-learn Pipeline private method `_fit` which is called + # by the `fit` method, since this allows us to access the transformed + # dataframe. For other models we could use the `transform` method, but + # the GradientBoostingRegressor does not have a `transform` method. + X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train) + + # Then + assert config.model_config.drop_features in X_train.columns + assert config.model_config.drop_features not in X_transformed.columns + + +def test_pipeline_transforms_temporal_features(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + + # When + # We use the scikit-learn Pipeline private method `_fit` which is called + # by the `fit` method, since this allows us to access the transformed + # dataframe. For other models we could use the `transform` method, but + # the GradientBoostingRegressor does not have a `transform` method. 
+ X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train) + + # Then + assert ( + X_transformed.iloc[0]["YearRemodAdd"] + == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] + ) diff --git a/packages/gradient_boosting_model/tests/test_preprocessors.py b/packages/gradient_boosting_model/tests/test_preprocessors.py new file mode 100644 index 0000000..11a4900 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_preprocessors.py @@ -0,0 +1,37 @@ +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing import preprocessors as pp + + +def test_drop_unnecessary_features_transformer(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + assert config.model_config.drop_features in X_train.columns + + transformer = pp.DropUnecessaryFeatures( + variables_to_drop=config.model_config.drop_features, + ) + + # When + X_transformed = transformer.transform(X_train) + + # Then + assert config.model_config.drop_features not in X_transformed.columns + + +def test_temporal_variable_estimator(pipeline_inputs): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + + transformer = pp.TemporalVariableEstimator( + variables=config.model_config.temporal_vars, + reference_variable=config.model_config.drop_features, + ) + + # When + X_transformed = transformer.transform(X_train) + + # Then + assert ( + X_transformed.iloc[0]["YearRemodAdd"] + == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] + ) diff --git a/packages/gradient_boosting_model/tox.ini b/packages/gradient_boosting_model/tox.ini new file mode 100644 index 0000000..218f99d --- /dev/null +++ b/packages/gradient_boosting_model/tox.ini @@ -0,0 +1,27 @@ +[tox] +envlist = unit_tests +skipsdist = True + + +[testenv] +install_command = pip install {opts} {packages} +deps = + -rrequirements.txt + +commands= + py.test + + +[testenv:unit_tests] +envdir = {toxworkdir}/unit_tests +deps = + {[testenv]deps} + +setenv = + 
PYTHONPATH=. + +commands = + pytest \ + -s \ + -vv \ + {posargs:tests/} From dcb0636279799b5b94ae6c3b25a27706cd4780c3 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 24 Nov 2019 14:28:13 +0000 Subject: [PATCH 02/26] Unit Testing Production ML Code - Test Config --- .../tests/test_config.py | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 packages/gradient_boosting_model/tests/test_config.py diff --git a/packages/gradient_boosting_model/tests/test_config.py b/packages/gradient_boosting_model/tests/test_config.py new file mode 100644 index 0000000..f7418a6 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_config.py @@ -0,0 +1,124 @@ +from pathlib import Path + +from gradient_boosting_model.config.core import ( + create_and_validate_config, + fetch_config_from_yaml, +) + +import pytest +from pydantic import ValidationError + + +TEST_CONFIG_TEXT = """ +package_name: gradient_boosting_model +training_data_file: houseprice.csv +test_data_file: test.csv +drop_features: YrSold +pipeline_name: gb_regression +pipeline_save_file: gb_regression_output_v +target: SalePrice +variables_to_rename: + foo: bar +test_size: 0.1 +features: + - LotArea +numerical_vars: + - LotArea +categorical_vars: + - BsmtQual +temporal_vars: YearRemodAdd +numerical_vars_with_na: + - LotFrontage +numerical_na_not_allowed: + - LotArea +random_state: 0 +n_estimators: 50 +rare_label_tol: 0.01 +rare_label_n_categories: 5 +loss: ls +allowed_loss_functions: + - ls + - huber +""" + +INVALID_TEST_CONFIG_TEXT = """ +package_name: gradient_boosting_model +training_data_file: houseprice.csv +test_data_file: test.csv +drop_features: YrSold +pipeline_name: gb_regression +pipeline_save_file: gb_regression_output_v +target: SalePrice +features: + - LotArea +numerical_vars: + - LotArea +categorical_vars: + - BsmtQual +temporal_vars: YearRemodAdd +numerical_vars_with_na: + - LotFrontage +numerical_na_not_allowed: + - LotArea +random_state: 0 +n_estimators: 50 
+rare_label_tol: 0.01 +rare_label_n_categories: 5 +loss: ls +allowed_loss_functions: + - huber +""" + + +def test_fetch_config_structure(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + config_1.write_text(TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + config = create_and_validate_config(parsed_config=parsed_config) + + # Then + assert config.model_config + assert config.app_config + + +def test_config_validation_raises_error_for_invalid_config(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + + # invalid config attempts to set a prohibited loss + # function which we validate against an allowed set of + # loss function parameters. + config_1.write_text(INVALID_TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + with pytest.raises(ValidationError) as excinfo: + create_and_validate_config(parsed_config=parsed_config) + + # Then + assert "not in the allowed set" in str(excinfo.value) + + +def test_missing_config_field_raises_validation_error(tmpdir): + # Given + # We make use of the pytest built-in tmpdir fixture + configs_dir = Path(tmpdir) + config_1 = configs_dir / "sample_config.yml" + TEST_CONFIG_TEXT = """package_name: gradient_boosting_model""" + config_1.write_text(TEST_CONFIG_TEXT) + parsed_config = fetch_config_from_yaml(cfg_path=config_1) + + # When + with pytest.raises(ValidationError) as excinfo: + create_and_validate_config(parsed_config=parsed_config) + + # Then + assert "field required" in str(excinfo.value) + assert "pipeline_name" in str(excinfo.value) From fd102c92ab591bc76041241c1a26bc05c221e122 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 24 Nov 2019 14:58:01 +0000 Subject: [PATCH 03/26] Unit Testing Production ML Code - Test Input Validation --- 
.../gradient_boosting_model/tests/conftest.py | 5 ++++ .../tests/test_pipeline.py | 17 +++++++++++ .../tests/test_validation.py | 30 +++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 packages/gradient_boosting_model/tests/test_validation.py diff --git a/packages/gradient_boosting_model/tests/conftest.py b/packages/gradient_boosting_model/tests/conftest.py index 076de27..a5f0cc4 100644 --- a/packages/gradient_boosting_model/tests/conftest.py +++ b/packages/gradient_boosting_model/tests/conftest.py @@ -21,3 +21,8 @@ def pipeline_inputs(): ) return X_train, X_test, y_train, y_test + + +@pytest.fixture() +def sample_input_data(): + return load_dataset(file_name=config.app_config.test_data_file) diff --git a/packages/gradient_boosting_model/tests/test_pipeline.py b/packages/gradient_boosting_model/tests/test_pipeline.py index e6ef088..3820995 100644 --- a/packages/gradient_boosting_model/tests/test_pipeline.py +++ b/packages/gradient_boosting_model/tests/test_pipeline.py @@ -1,5 +1,6 @@ from gradient_boosting_model import pipeline from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.validation import validate_inputs def test_pipeline_drops_unnecessary_features(pipeline_inputs): @@ -35,3 +36,19 @@ def test_pipeline_transforms_temporal_features(pipeline_inputs): X_transformed.iloc[0]["YearRemodAdd"] == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] ) + + +def test_pipeline_predict_takes_validated_input(pipeline_inputs, sample_input_data): + # Given + X_train, X_test, y_train, y_test = pipeline_inputs + pipeline.price_pipe.fit(X_train, y_train) + + # When + validated_inputs, errors = validate_inputs(input_data=sample_input_data) + predictions = pipeline.price_pipe.predict( + validated_inputs[config.model_config.features] + ) + + # Then + assert predictions is not None + assert errors is None diff --git a/packages/gradient_boosting_model/tests/test_validation.py 
b/packages/gradient_boosting_model/tests/test_validation.py new file mode 100644 index 0000000..b636674 --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_validation.py @@ -0,0 +1,30 @@ +from gradient_boosting_model.processing.validation import validate_inputs + + +def test_validate_inputs(sample_input_data): + # When + validated_inputs, errors = validate_inputs(input_data=sample_input_data) + + # Then + assert not errors + + # we expect that 2 rows are removed due to missing vars + # 1459 is the total number of rows in the test data set (test.csv) + # and 1457 number returned after 2 rows are filtered out. + assert len(sample_input_data) == 1459 + assert len(validated_inputs) == 1457 + + +def test_validate_inputs_identifies_errors(sample_input_data): + # Given + test_inputs = sample_input_data.copy() + + # introduce errors + test_inputs.at[1, "BldgType"] = 50 # we expect a string + + # When + validated_inputs, errors = validate_inputs(input_data=test_inputs) + + # Then + assert errors + assert errors[1] == {"BldgType": ["Not a valid string."]} From 2ca89131c60c7e0085461949c84fc14358da73be Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 24 Nov 2019 18:07:35 +0000 Subject: [PATCH 04/26] Unit Testing Production ML Code - Test Model Quality --- .../gradient_boosting_model/requirements.txt | 4 ++ .../gradient_boosting_model/tests/conftest.py | 6 ++ .../tests/test_predict.py | 62 +++++++++++++++++++ packages/gradient_boosting_model/tox.ini | 1 + 4 files changed, 73 insertions(+) create mode 100644 packages/gradient_boosting_model/tests/test_predict.py diff --git a/packages/gradient_boosting_model/requirements.txt b/packages/gradient_boosting_model/requirements.txt index 9af412d..cbb1894 100644 --- a/packages/gradient_boosting_model/requirements.txt +++ b/packages/gradient_boosting_model/requirements.txt @@ -18,3 +18,7 @@ wheel>=0.33.6,<0.34.0 # testing requirements pytest>=5.3.2,<6.0.0 + +# old model for testing purposes +# source code: 
https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model +tid-regression-model>=2.0.20,<2.1.0 diff --git a/packages/gradient_boosting_model/tests/conftest.py b/packages/gradient_boosting_model/tests/conftest.py index a5f0cc4..c896d72 100644 --- a/packages/gradient_boosting_model/tests/conftest.py +++ b/packages/gradient_boosting_model/tests/conftest.py @@ -23,6 +23,12 @@ def pipeline_inputs(): return X_train, X_test, y_train, y_test +@pytest.fixture() +def raw_training_data(): + # For larger datasets, here we would use a testing sub-sample. + return load_dataset(file_name=config.app_config.training_data_file) + + @pytest.fixture() def sample_input_data(): return load_dataset(file_name=config.app_config.test_data_file) diff --git a/packages/gradient_boosting_model/tests/test_predict.py b/packages/gradient_boosting_model/tests/test_predict.py new file mode 100644 index 0000000..5be393e --- /dev/null +++ b/packages/gradient_boosting_model/tests/test_predict.py @@ -0,0 +1,62 @@ +from gradient_boosting_model.predict import make_prediction +from gradient_boosting_model.config.core import config + +from sklearn.metrics import mean_squared_error + +from regression_model.predict import make_prediction as alt_make_prediction + + +def test_prediction_quality_against_benchmark(raw_training_data, sample_input_data): + # Given + input_df = raw_training_data.drop(config.model_config.target, axis=1) + output_df = raw_training_data[config.model_config.target] + + # Generate rough benchmarks (you would tweak depending on your model) + benchmark_flexibility = 50000 + # setting ndigits to -4 will round the value to the nearest 10,000 i.e. 
210,000 + benchmark_lower_boundary = ( + round(output_df.iloc[0], ndigits=-4) - benchmark_flexibility + ) # 210,000 - 50000 = 160000 + benchmark_upper_boundary = ( + round(output_df.iloc[0], ndigits=-4) + benchmark_flexibility + ) # 210000 + 50000 = 260000 + + # When + subject = make_prediction(input_data=input_df[0:1]) + + # Then + assert subject is not None + prediction = subject.get("predictions")[0] + assert isinstance(prediction, float) + assert prediction > benchmark_lower_boundary + assert prediction < benchmark_upper_boundary + + +def test_prediction_quality_against_another_model(raw_training_data, sample_input_data): + # Given + input_df = raw_training_data.drop(config.model_config.target, axis=1) + output_df = raw_training_data[config.model_config.target] + current_predictions = make_prediction(input_data=input_df) + + # the older model has these variable names reversed + input_df.rename( + columns={ + "FirstFlrSF": "1stFlrSF", + "SecondFlrSF": "2ndFlrSF", + "ThreeSsnPortch": "3SsnPorch", + }, + inplace=True, + ) + alternative_predictions = alt_make_prediction(input_data=input_df) + + # When + current_mse = mean_squared_error( + y_true=output_df.values, y_pred=current_predictions["predictions"] + ) + + alternative_mse = mean_squared_error( + y_true=output_df.values, y_pred=alternative_predictions["predictions"] + ) + + # Then + assert current_mse < alternative_mse diff --git a/packages/gradient_boosting_model/tox.ini b/packages/gradient_boosting_model/tox.ini index 218f99d..b6c5e1c 100644 --- a/packages/gradient_boosting_model/tox.ini +++ b/packages/gradient_boosting_model/tox.ini @@ -21,6 +21,7 @@ setenv = PYTHONPATH=. 
commands = + python gradient_boosting_model/train_pipeline.py pytest \ -s \ -vv \ From ebf76911f36bce6863fa2e7aeaa9409d944c2dde Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 24 Nov 2019 18:39:25 +0000 Subject: [PATCH 05/26] Unit Testing Production ML Code - Add Tooling --- packages/gradient_boosting_model/mypy.ini | 11 ++++++++ .../gradient_boosting_model/requirements.txt | 7 ----- .../test_requirements.txt | 13 +++++++++ .../tests/test_config.py | 2 +- packages/gradient_boosting_model/tox.ini | 27 +++++++++++++++++-- 5 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 packages/gradient_boosting_model/mypy.ini create mode 100644 packages/gradient_boosting_model/test_requirements.txt diff --git a/packages/gradient_boosting_model/mypy.ini b/packages/gradient_boosting_model/mypy.ini new file mode 100644 index 0000000..97e52a5 --- /dev/null +++ b/packages/gradient_boosting_model/mypy.ini @@ -0,0 +1,11 @@ +[mypy] +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = True +cache_dir = /dev/null +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True diff --git a/packages/gradient_boosting_model/requirements.txt b/packages/gradient_boosting_model/requirements.txt index cbb1894..198c30e 100644 --- a/packages/gradient_boosting_model/requirements.txt +++ b/packages/gradient_boosting_model/requirements.txt @@ -15,10 +15,3 @@ marshmallow>=3.2.2,<4.0 # packaging setuptools>=41.4.0,<42.0.0 wheel>=0.33.6,<0.34.0 - -# testing requirements -pytest>=5.3.2,<6.0.0 - -# old model for testing purposes -# source code: https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model -tid-regression-model>=2.0.20,<2.1.0 diff --git a/packages/gradient_boosting_model/test_requirements.txt b/packages/gradient_boosting_model/test_requirements.txt new file mode 100644 index 0000000..993846e 
--- /dev/null +++ b/packages/gradient_boosting_model/test_requirements.txt @@ -0,0 +1,13 @@ +-r requirements.txt + +# testing requirements +pytest>=5.3.2,<6.0.0 + +# old model for testing purposes +# source code: https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model +tid-regression-model>=2.0.20,<2.1.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 diff --git a/packages/gradient_boosting_model/tests/test_config.py b/packages/gradient_boosting_model/tests/test_config.py index f7418a6..5a82241 100644 --- a/packages/gradient_boosting_model/tests/test_config.py +++ b/packages/gradient_boosting_model/tests/test_config.py @@ -17,7 +17,7 @@ pipeline_name: gb_regression pipeline_save_file: gb_regression_output_v target: SalePrice -variables_to_rename: +variables_to_rename: foo: bar test_size: 0.1 features: diff --git a/packages/gradient_boosting_model/tox.ini b/packages/gradient_boosting_model/tox.ini index b6c5e1c..e898072 100644 --- a/packages/gradient_boosting_model/tox.ini +++ b/packages/gradient_boosting_model/tox.ini @@ -1,12 +1,12 @@ [tox] -envlist = unit_tests +envlist = unit_tests,typechecks,stylechecks skipsdist = True [testenv] install_command = pip install {opts} {packages} deps = - -rrequirements.txt + -rtest_requirements.txt commands= py.test @@ -26,3 +26,26 @@ commands = -s \ -vv \ {posargs:tests/} + + +[testenv:typechecks] +envdir = {toxworkdir}/unit_tests + +deps = + {[testenv:unit_tests]deps} + +commands = {posargs:mypy gradient_boosting_model} + + +[testenv:stylechecks] +envdir = {toxworkdir}/unit_tests + +deps = + {[testenv:unit_tests]deps} + +commands = {posargs:flake8 gradient_boosting_model tests} + + +[flake8] +exclude = .git,env +max-line-length = 90 \ No newline at end of file From 8c6dd5921d75a4b8d479777a9278ce08752424cc Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 1 Dec 2019 15:27:17 +0000 Subject: [PATCH 06/26] Integration Testing 
Production ML Code - Initial Setup --- packages/ml_api/.dockerignore | 17 +++ packages/ml_api/Makefile | 9 ++ packages/ml_api/__init__.py | 0 packages/ml_api/api/__init__.py | 0 packages/ml_api/api/app.py | 23 ++++ packages/ml_api/api/config.py | 70 ++++++++++++ packages/ml_api/api/controller.py | 32 ++++++ packages/ml_api/api/spec/__init__.py | 0 packages/ml_api/api/spec/api.yaml | 127 ++++++++++++++++++++++ packages/ml_api/docker/Dockerfile | 18 +++ packages/ml_api/docker/docker-compose.yml | 9 ++ packages/ml_api/requirements.txt | 11 ++ packages/ml_api/run.py | 13 +++ 13 files changed, 329 insertions(+) create mode 100644 packages/ml_api/.dockerignore create mode 100644 packages/ml_api/Makefile create mode 100644 packages/ml_api/__init__.py create mode 100644 packages/ml_api/api/__init__.py create mode 100644 packages/ml_api/api/app.py create mode 100644 packages/ml_api/api/config.py create mode 100644 packages/ml_api/api/controller.py create mode 100644 packages/ml_api/api/spec/__init__.py create mode 100644 packages/ml_api/api/spec/api.yaml create mode 100644 packages/ml_api/docker/Dockerfile create mode 100644 packages/ml_api/docker/docker-compose.yml create mode 100644 packages/ml_api/requirements.txt create mode 100644 packages/ml_api/run.py diff --git a/packages/ml_api/.dockerignore b/packages/ml_api/.dockerignore new file mode 100644 index 0000000..8bd7171 --- /dev/null +++ b/packages/ml_api/.dockerignore @@ -0,0 +1,17 @@ +exercise_notebooks/* +*env* +*venv* +.circleci* +packages/gradient_boosting_model +*.env +*.log +.git +.gitignore +.dockerignore +*.mypy_cache +*.pytest_cache +*.tox + +# Byte-compiled / optimized / DLL files +*__pycache__* +*.py[cod] \ No newline at end of file diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile new file mode 100644 index 0000000..bca844b --- /dev/null +++ b/packages/ml_api/Makefile @@ -0,0 +1,9 @@ +# For details on Makefiles, see the section notes. 
+ +# Specify phony list to ensure make recipes do not conflict with real file names +.PHONY: run-service-development + +# start up Flask API service +run-service-development: + @echo "+ $@" + python run.py diff --git a/packages/ml_api/__init__.py b/packages/ml_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/__init__.py b/packages/ml_api/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py new file mode 100644 index 0000000..81848d4 --- /dev/null +++ b/packages/ml_api/api/app.py @@ -0,0 +1,23 @@ +import logging + +import connexion + +from api.config import Config + + +_logger = logging.getLogger(__name__) + + +def create_app(*, config_object: Config) -> connexion.App: + """Create app instance.""" + + connexion_app = connexion.App( + __name__, debug=config_object.DEBUG, specification_dir="spec/" + ) + flask_app = connexion_app.app + flask_app.config.from_object(config_object) + connexion_app.add_api("api.yaml") + + _logger.info("Application instance created") + + return connexion_app diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py new file mode 100644 index 0000000..6bb032a --- /dev/null +++ b/packages/ml_api/api/config.py @@ -0,0 +1,70 @@ +import logging +import os +import pathlib +import sys + +import api + + +# logging format +FORMATTER = logging.Formatter( + "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" +) + +# Project Directories +ROOT = pathlib.Path(api.__file__).resolve().parent.parent + + +class Config: + DEBUG = False + TESTING = False + ENV = os.getenv("FLASK_ENV", "production") + SERVER_PORT = int(os.getenv("SERVER_PORT", 5000)) + SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0") + LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO) + + +class DevelopmentConfig(Config): + DEBUG = True + ENV = "development" # do not use in production! 
+ LOGGING_LEVEL = logging.DEBUG + + +class TestingConfig(Config): + DEBUG = True + TESTING = True + LOGGING_LEVEL = logging.DEBUG + + +class ProductionConfig(Config): + pass + + +def get_console_handler(): + """Setup console logging handler.""" + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(FORMATTER) + return console_handler + + +def setup_app_logging(config: Config) -> None: + """Prepare custom logging for our application.""" + _disable_irrelevant_loggers() + root = logging.getLogger() + root.setLevel(config.LOGGING_LEVEL) + root.addHandler(get_console_handler()) + root.propagate = False + + +def _disable_irrelevant_loggers() -> None: + """Disable loggers created by packages which create a lot of noise.""" + for logger_name in ( + "connexion.apis.flask_api", + "connexion.apis.abstract", + "connexion.decorators", + "connexion.operation", + "connexion.operations", + "connexion.app", + "openapi_spec_validator", + ): + logging.getLogger(logger_name).level = logging.WARNING diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py new file mode 100644 index 0000000..5e425c6 --- /dev/null +++ b/packages/ml_api/api/controller.py @@ -0,0 +1,32 @@ +from flask import request, jsonify, Response +from gradient_boosting_model.predict import make_prediction + +import json + + +def health(): + if request.method == "GET": + return jsonify({"status": "ok"}) + + +def predict(): + if request.method == "POST": + # Step 1: Extract POST data from request body as JSON + json_data = request.get_json() + + # Step 2: Access the model prediction function (also validates data) + result = make_prediction(input_data=json_data) + + # Step 3: Handle errors + errors = result.get("errors") + if errors: + return Response(json.dumps(errors), status=400) + + # Step 4: Split out results + predictions = result.get("predictions").tolist() + version = result.get("version") + + # Step 5: Prepare prediction response + return jsonify( + 
{"predictions": predictions, "version": version, "errors": errors} + ) diff --git a/packages/ml_api/api/spec/__init__.py b/packages/ml_api/api/spec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/spec/api.yaml b/packages/ml_api/api/spec/api.yaml new file mode 100644 index 0000000..cd33cf5 --- /dev/null +++ b/packages/ml_api/api/spec/api.yaml @@ -0,0 +1,127 @@ +openapi: 3.0.0 + +info: + title: Spec for House Price Prediction API + version: '1' + +servers: +- url: http://{base}:5000/ + description: API for performing house price predictions. + variables: + base: + default: 0.0.0.0 + +paths: + /: + get: + operationId: api.controller.health + responses: + '200': + description: API Health Status + + /v1/predictions: + post: + operationId: api.controller.predict + requestBody: + description: House details used to make price prediction + required: true + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/HouseDetails' + responses: + '200': + description: House Price Predictions + '400': + description: Bad request, house data validation failed + '5XX': + description: Unexpected error + +components: + schemas: + HouseDetails: + type: object + description: "List of the houses to get predictions for." 
+ example: + Id: 1461 + MSSubClass: 20 + MSZoning: RH + LotFrontage: 80.0 + LotArea: 11622 + Street: Pave + Alley: null + LotShape: Reg + LandContour: Lvl + Utilities: AllPub + LotConfig: Inside + LandSlope: Gtl + Neighborhood: NAmes + Condition1: Feedr + Condition2: Norm + BldgType: 1Fam + HouseStyle: 1Story + OverallQual: 5 + OverallCond: 6 + YearBuilt: 1961 + YearRemodAdd: 1961 + RoofStyle: Gable + RoofMatl: CompShg + Exterior1st: VinylSd + Exterior2nd: VinylSd + MasVnrType: None + MasVnrArea: 0.0 + ExterQual: TA + ExterCond: TA + Foundation: CBlock + BsmtQual: TA + BsmtCond: TA + BsmtExposure: null + BsmtFinType1: Rec + BsmtFinSF1: 468.0 + BsmtFinType2: LwQ + BsmtFinSF2: 144.0 + BsmtUnfSF: 270.0 + TotalBsmtSF: 882.0 + Heating: GasA + HeatingQC: TA + CentralAir: Y + Electrical: SBrkr + 1stFlrSF: 896 + 2ndFlrSF: 0 + LowQualFinSF: 0 + GrLivArea: 896 + BsmtFullBath: 0.0 + BsmtHalfBath: 0.0 + FullBath: 1 + HalfBath: 0 + BedroomAbvGr: 2 + KitchenAbvGr: 1 + KitchenQual: TA + TotRmsAbvGrd: 5 + Functional: Typ + Fireplaces: 0 + FireplaceQu: null + GarageType: Attchd + GarageYrBlt: 1961.0 + GarageFinish: Unf + GarageCars: 1.0 + GarageArea: 730.0 + GarageQual: TA + GarageCond: TA + PavedDrive: Y + WoodDeckSF: 140 + OpenPorchSF: 0 + EnclosedPorch: 0 + 3SsnPorch: 0 + ScreenPorch: 120 + PoolArea: 0 + PoolQC: null + Fence: MnPrv + MiscFeature: null + MiscVal: 0 + MoSold: 6 + YrSold: 2010 + SaleType: WD + SaleCondition: Normal diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile new file mode 100644 index 0000000..8fad752 --- /dev/null +++ b/packages/ml_api/docker/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements.txt /opt/app/requirements.txt +RUN pip install --upgrade pip + +# ensure we can run the make commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl + +RUN pip install -r 
/opt/app/requirements.txt +COPY Makefile /opt/app/Makefile +COPY api /opt/app/api +COPY run.py /opt/app/run.py +WORKDIR /opt/app diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml new file mode 100644 index 0000000..efe4b10 --- /dev/null +++ b/packages/ml_api/docker/docker-compose.yml @@ -0,0 +1,9 @@ +version: '3' +services: + ml_api: + build: + context: ../ + dockerfile: docker/Dockerfile + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make run-service-development" diff --git a/packages/ml_api/requirements.txt b/packages/ml_api/requirements.txt new file mode 100644 index 0000000..47d3df0 --- /dev/null +++ b/packages/ml_api/requirements.txt @@ -0,0 +1,11 @@ +# ML Model +tid-gradient-boosting-model>=0.1.18,<0.2.0 + +# Web microframework for the API +flask>=1.1.1,<1.2.0 +connexion[swagger-ui]>=2.5.1,<2.6.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 \ No newline at end of file diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py new file mode 100644 index 0000000..898a50c --- /dev/null +++ b/packages/ml_api/run.py @@ -0,0 +1,13 @@ +from api.app import create_app +from api.config import DevelopmentConfig, setup_app_logging + + +_config = DevelopmentConfig() + +# setup logging as early as possible +setup_app_logging(config=_config) +application = create_app(config_object=_config).app + + +if __name__ == "__main__": + application.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST) From 031ac57040764ff42be09643a0fb1816ce167309 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 1 Dec 2019 18:42:17 +0000 Subject: [PATCH 07/26] Integration Testing Production ML Code - Create Integration Tests --- packages/ml_api/docker/Dockerfile | 4 +- .../workaround_32_os/Dockerfile.workaround | 20 +++++ .../docker-compose-workaround.yml | 13 +++ packages/ml_api/mypy.ini | 11 +++ .../{ => requirements}/requirements.txt | 2 +- 
.../ml_api/requirements/test_requirements.txt | 9 ++ packages/ml_api/tests/__init__.py | 0 packages/ml_api/tests/conftest.py | 17 ++++ packages/ml_api/tests/test_api.py | 84 +++++++++++++++++++ packages/ml_api/tox.ini | 63 ++++++++++++++ 10 files changed, 220 insertions(+), 3 deletions(-) create mode 100644 packages/ml_api/docker/workaround_32_os/Dockerfile.workaround create mode 100644 packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml create mode 100644 packages/ml_api/mypy.ini rename packages/ml_api/{ => requirements}/requirements.txt (94%) create mode 100644 packages/ml_api/requirements/test_requirements.txt create mode 100644 packages/ml_api/tests/__init__.py create mode 100644 packages/ml_api/tests/conftest.py create mode 100644 packages/ml_api/tests/test_api.py create mode 100644 packages/ml_api/tox.ini diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile index 8fad752..f374941 100644 --- a/packages/ml_api/docker/Dockerfile +++ b/packages/ml_api/docker/Dockerfile @@ -1,7 +1,7 @@ FROM python:3.7.5-slim-buster RUN mkdir -p /opt/app -COPY requirements.txt /opt/app/requirements.txt +COPY requirements /opt/app/requirements RUN pip install --upgrade pip # ensure we can run the make commands @@ -11,7 +11,7 @@ RUN apt-get update -y && \ # for swagger apt-get install -y curl -RUN pip install -r /opt/app/requirements.txt +RUN pip install -r /opt/app/requirements/requirements.txt COPY Makefile /opt/app/Makefile COPY api /opt/app/api COPY run.py /opt/app/run.py diff --git a/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround b/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround new file mode 100644 index 0000000..ca8c7e9 --- /dev/null +++ b/packages/ml_api/docker/workaround_32_os/Dockerfile.workaround @@ -0,0 +1,20 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements /opt/app/requirements +RUN pip install --upgrade pip +RUN pip install tox + +# ensure we can run the make 
commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl + +RUN pip install -r /opt/app/requirements/test_requirements.txt +COPY tests /opt/app/tests +COPY tox.ini /opt/app/tox.ini +COPY api /opt/app/api +COPY run.py /opt/app/run.py +WORKDIR /opt/app diff --git a/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml b/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml new file mode 100644 index 0000000..171d4e2 --- /dev/null +++ b/packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml @@ -0,0 +1,13 @@ +# This is only to be used as a workaround for students who +# are unable to install the gradient_boosting_model package +# because they are on a 32 bit operating system + +version: '3' +services: + ml_api: + build: + context: ../../ + dockerfile: docker/workaround_32_os/Dockerfile.workaround + ports: + - "5000:5000" + command: bash -c "tox -e integration_tests" diff --git a/packages/ml_api/mypy.ini b/packages/ml_api/mypy.ini new file mode 100644 index 0000000..97e52a5 --- /dev/null +++ b/packages/ml_api/mypy.ini @@ -0,0 +1,11 @@ +[mypy] +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = True +cache_dir = /dev/null +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True diff --git a/packages/ml_api/requirements.txt b/packages/ml_api/requirements/requirements.txt similarity index 94% rename from packages/ml_api/requirements.txt rename to packages/ml_api/requirements/requirements.txt index 47d3df0..70af5a4 100644 --- a/packages/ml_api/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -8,4 +8,4 @@ connexion[swagger-ui]>=2.5.1,<2.6.0 # repo maintenance tooling black>=19.10b0,<20.0 flake8>=3.7.9,<4.0 -mypy>=0.740 \ No newline at end of file +mypy>=0.740 diff --git 
a/packages/ml_api/requirements/test_requirements.txt b/packages/ml_api/requirements/test_requirements.txt new file mode 100644 index 0000000..5a3a35f --- /dev/null +++ b/packages/ml_api/requirements/test_requirements.txt @@ -0,0 +1,9 @@ +-r requirements.txt + +# testing requirements +pytest>=5.3.2,<6.0.0 + +# repo maintenance tooling +black>=19.10b0,<20.0 +flake8>=3.7.9,<4.0 +mypy>=0.740 \ No newline at end of file diff --git a/packages/ml_api/tests/__init__.py b/packages/ml_api/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/tests/conftest.py b/packages/ml_api/tests/conftest.py new file mode 100644 index 0000000..c42f56a --- /dev/null +++ b/packages/ml_api/tests/conftest.py @@ -0,0 +1,17 @@ +import pytest + +from api.config import TestingConfig +from api.app import create_app + + +@pytest.fixture(scope='session') +def app(): + app = create_app(config_object=TestingConfig()).app + with app.app_context(): + yield app + + +@pytest.fixture +def client(app): + with app.test_client() as client: + yield client # Has to be yielded to access session cookies diff --git a/packages/ml_api/tests/test_api.py b/packages/ml_api/tests/test_api.py new file mode 100644 index 0000000..93dfa16 --- /dev/null +++ b/packages/ml_api/tests/test_api.py @@ -0,0 +1,84 @@ +import json + +import numpy as np +import pytest +from gradient_boosting_model.processing.data_management import load_dataset + + +@pytest.mark.integration +def test_health_endpoint(client): + # When + response = client.get("/") + + # Then + assert response.status_code == 200 + assert json.loads(response.data) == {"status": "ok"} + + +@pytest.mark.integration +def test_prediction_endpoint(client): + # Given + # Load the test dataset which is included in the model package + test_inputs_df = load_dataset(file_name="test.csv") # dataframe + input_length = len(test_inputs_df) # test csv contains 1459 rows + expected_output_length = input_length - 2 # we expect 2 rows to be filtered + + 
# When + response = client.post( + "/v1/predictions", json=test_inputs_df.to_dict(orient="records") + ) + + # Then + assert response.status_code == 200 + data = json.loads(response.data) + assert data["errors"] is None + assert len(data["predictions"]) == expected_output_length + + +# parameterizationa allows us to try many combinations of data +# within the same test, see the pytest docs for details: +# https://docs.pytest.org/en/latest/parametrize.html +@pytest.mark.parametrize( + "field, field_value, index, expected_error", + ( + ( + "BldgType", + 1, # expected str + 33, + {"33": {"BldgType": ["Not a valid string."]}}, + ), + ( + "GarageArea", # model feature + "abc", # expected float + 45, + {"45": {"GarageArea": ["Not a valid number."]}}, + ), + ( + "CentralAir", + np.nan, # nan not allowed + 34, + {"34": {"CentralAir": ["Field may not be null."]}}, + ), + ("LotArea", "", 2, {"2": {"LotArea": ["Not a valid integer."]}},), + ), +) +@pytest.mark.integration +def test_prediction_validation(field, field_value, index, expected_error, client): + # Given + # Load the test dataset which is included in the model package + test_inputs_df = load_dataset(file_name="test.csv") # dataframe + + # Check gradient_boosting_model.processing.validation import HouseDataInputSchema + # and you will see the expected values for the inputs to the house price prediction + # model. In this test, inputs are changed to incorrect values to check the validation. 
+ test_inputs_df.loc[index, field] = field_value + + # When + response = client.post( + "/v1/predictions", json=test_inputs_df.to_dict(orient="records") + ) + + # Then + assert response.status_code == 400 + data = json.loads(response.data) + assert data == expected_error diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini new file mode 100644 index 0000000..d0101ee --- /dev/null +++ b/packages/ml_api/tox.ini @@ -0,0 +1,63 @@ +[tox] +envlist = integration_tests,typechecks,stylechecks +skipsdist = True + + +[testenv] +install_command = pip install {opts} {packages} + +deps = + -rrequirements/test_requirements.txt + +commands= + py.test + + +[testenv:integration_tests] +envdir = {toxworkdir}/integration_tests +deps = + {[testenv]deps} + +setenv = + PYTHONPATH=. + +commands = + pytest \ + -s \ + -vv \ + -m integration \ + {posargs:tests/} + + +[testenv:typechecks] +envdir = {toxworkdir}/integration_tests + +deps = + {[testenv:integration_tests]deps} + +commands = {posargs:mypy api} + + +[testenv:stylechecks] +envdir = {toxworkdir}/integration_tests + +deps = + {[testenv:integration_tests]deps} + +commands = {posargs:flake8 api tests} + + +[flake8] +exclude = .git,env +max-line-length = 90 + + +[pytest] +markers = + integration: mark a test as an integration test. 
+ +filterwarnings = + ignore::DeprecationWarning + ignore::RuntimeWarning + ignore::UserWarning + ignore::FutureWarning \ No newline at end of file From cf7a3f6b629d341f0fdf1bafa81c6e2610c4ebdc Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 7 Dec 2019 10:32:43 +0000 Subject: [PATCH 08/26] Advanced Testing Production ML Code - Create Differential Tests --- packages/ml_api/api/controller.py | 24 +++++++++ packages/ml_api/api/spec/api.yaml | 22 +++++++- packages/ml_api/requirements/requirements.txt | 3 ++ .../tests/differential_tests/__init__.py | 0 .../tests/differential_tests/compare.py | 50 +++++++++++++++++++ .../test_back_to_back_models.py | 37 ++++++++++++++ packages/ml_api/tests/test_api.py | 36 ++++++++++--- packages/ml_api/tox.ini | 21 +++++++- 8 files changed, 184 insertions(+), 9 deletions(-) create mode 100644 packages/ml_api/tests/differential_tests/__init__.py create mode 100644 packages/ml_api/tests/differential_tests/compare.py create mode 100644 packages/ml_api/tests/differential_tests/test_back_to_back_models.py diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 5e425c6..5da3d97 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -1,5 +1,6 @@ from flask import request, jsonify, Response from gradient_boosting_model.predict import make_prediction +from regression_model.predict import make_prediction as make_regression_prediction import json @@ -30,3 +31,26 @@ def predict(): return jsonify( {"predictions": predictions, "version": version, "errors": errors} ) + + +def predict_previous(): + if request.method == "POST": + # Step 1: Extract POST data from request body as JSON + json_data = request.get_json() + + # Step 2: Access the model prediction function (also validates data) + result = make_regression_prediction(input_data=json_data) + + # Step 3: Handle errors + errors = result.get("errors") + if errors: + return Response(json.dumps(errors), status=400) + + # 
Step 4: Split out results + predictions = result.get("predictions").tolist() + version = result.get("version") + + # Step 5: Prepare prediction response + return jsonify( + {"predictions": predictions, "version": version, "errors": errors} + ) diff --git a/packages/ml_api/api/spec/api.yaml b/packages/ml_api/api/spec/api.yaml index cd33cf5..0b95033 100644 --- a/packages/ml_api/api/spec/api.yaml +++ b/packages/ml_api/api/spec/api.yaml @@ -19,7 +19,7 @@ paths: '200': description: API Health Status - /v1/predictions: + /v1/predictions/gradient: post: operationId: api.controller.predict requestBody: @@ -39,6 +39,26 @@ paths: '5XX': description: Unexpected error + /v1/predictions/regression: + post: + operationId: api.controller.predict_previous + requestBody: + description: House details used to make price prediction + required: true + content: + application/json: + schema: + type: array + items: + $ref: '#/components/schemas/HouseDetails' + responses: + '200': + description: House Price Predictions + '400': + description: Bad request, house data validation failed + '5XX': + description: Unexpected error + components: schemas: HouseDetails: diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt index 70af5a4..ae99f24 100644 --- a/packages/ml_api/requirements/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -1,6 +1,9 @@ # ML Model tid-gradient-boosting-model>=0.1.18,<0.2.0 +# Old model +tid-regression-model>=2.0.20,<2.1.0 + # Web microframework for the API flask>=1.1.1,<1.2.0 connexion[swagger-ui]>=2.5.1,<2.6.0 diff --git a/packages/ml_api/tests/differential_tests/__init__.py b/packages/ml_api/tests/differential_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/tests/differential_tests/compare.py b/packages/ml_api/tests/differential_tests/compare.py new file mode 100644 index 0000000..1610a27 --- /dev/null +++ 
b/packages/ml_api/tests/differential_tests/compare.py @@ -0,0 +1,50 @@ +import typing as t +import math + + +def compare_differences( + *, + expected_predictions: t.List, + actual_predictions: t.List, + rel_tol: t.Optional[float] = None, + abs_tol: t.Optional[float] = None, +) -> None: + """ + :param rel_tol: is the relative tolerance – it is the maximum allowed difference + between a and b, relative to the larger absolute value of a or b. + For example, to set a tolerance of 5%, pass rel_tol=0.05. The default + tolerance is 1e-09, which assures that the two values are the same within + about 9 decimal digits. rel_tol must be greater than zero. + + :param abs_tol: abs_tol is the minimum absolute tolerance – useful for comparisons + near zero. abs_tol must be at least zero. + """ + only_in_expected = len(expected_predictions) - len(actual_predictions) + + if only_in_expected: + raise ValueError(f"Missing {only_in_expected} predictions") + + only_in_actual = len(actual_predictions) - len(expected_predictions) + + if only_in_actual: + raise ValueError(f"Found {only_in_actual} unexpected predictions") + + thresholds = {} + + if abs_tol is not None: + thresholds["abs_tol"] = abs_tol + + if rel_tol is not None: + thresholds["rel_tol"] = rel_tol + + for index, (actual_prediction, expected_prediction) in enumerate( + zip(actual_predictions, expected_predictions) + ): + + if not math.isclose(expected_prediction, actual_prediction, **thresholds): + raise ValueError( + f"Price prediction {index} has changed by more " + f"than the thresholds: {thresholds}: " + f"{expected_prediction} (expected) vs " + f"{actual_prediction} (actual)" + ) diff --git a/packages/ml_api/tests/differential_tests/test_back_to_back_models.py b/packages/ml_api/tests/differential_tests/test_back_to_back_models.py new file mode 100644 index 0000000..85f1f67 --- /dev/null +++ b/packages/ml_api/tests/differential_tests/test_back_to_back_models.py @@ -0,0 +1,37 @@ +import json + +import pytest +from 
gradient_boosting_model.processing.data_management import load_dataset + +from tests.test_api import SECONDARY_VARIABLES_TO_RENAME +from .compare import compare_differences + + +@pytest.mark.differential +def test_model_prediction_differentials(client): + test_inputs_df = load_dataset(file_name="test.csv") + old_model_inputs_df = test_inputs_df.rename( + columns=SECONDARY_VARIABLES_TO_RENAME + ) + + new_model_response = client.post( + "v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records") + ) + new_model_predictions = json.loads(new_model_response.data)["predictions"] + + old_model_response = client.post( + "v1/predictions/regression", + json=old_model_inputs_df.to_dict(orient="records"), + ) + old_model_predictions = json.loads(old_model_response.data)["predictions"] + + # We just pass in the first 10 rows as the two models' validation differs + # which means they filter out a slightly different number of rows + # which would cause the differential tests to fail. + compare_differences( + expected_predictions=new_model_predictions[:10], + actual_predictions=old_model_predictions[:10], + # you would adjust the rel_tol level parameter on your model. + # right now this is extremely permissive of variation. 
+ rel_tol=0.2, + ) diff --git a/packages/ml_api/tests/test_api.py b/packages/ml_api/tests/test_api.py index 93dfa16..91ff466 100644 --- a/packages/ml_api/tests/test_api.py +++ b/packages/ml_api/tests/test_api.py @@ -5,6 +5,13 @@ from gradient_boosting_model.processing.data_management import load_dataset +SECONDARY_VARIABLES_TO_RENAME = { + "FirstFlrSF": "1stFlrSF", + "SecondFlrSF": "2ndFlrSF", + "ThreeSsnPortch": "3SsnPorch", +} + + @pytest.mark.integration def test_health_endpoint(client): # When @@ -16,23 +23,40 @@ def test_health_endpoint(client): @pytest.mark.integration -def test_prediction_endpoint(client): +@pytest.mark.parametrize( + "api_endpoint, expected_no_predictions", + ( + ( + "v1/predictions/gradient", + # test csv contains 1459 rows + # we expect 2 rows to be filtered + 1457, + ), + ( + "v1/predictions/regression", + # we expect 8 rows to be filtered + 1451, + ), + ), +) +def test_prediction_endpoint(api_endpoint, expected_no_predictions, client): # Given # Load the test dataset which is included in the model package test_inputs_df = load_dataset(file_name="test.csv") # dataframe - input_length = len(test_inputs_df) # test csv contains 1459 rows - expected_output_length = input_length - 2 # we expect 2 rows to be filtered + if api_endpoint == "v1/predictions/regression": + # adjust column names to those expected by the secondary model + test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True) # When response = client.post( - "/v1/predictions", json=test_inputs_df.to_dict(orient="records") + api_endpoint, json=test_inputs_df.to_dict(orient="records") ) # Then assert response.status_code == 200 data = json.loads(response.data) assert data["errors"] is None - assert len(data["predictions"]) == expected_output_length + assert len(data["predictions"]) == expected_no_predictions # parameterizationa allows us to try many combinations of data @@ -75,7 +99,7 @@ def test_prediction_validation(field, field_value, index, expected_error, 
client # When response = client.post( - "/v1/predictions", json=test_inputs_df.to_dict(orient="records") + "/v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records") ) # Then diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index d0101ee..1a8ecc2 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = integration_tests,typechecks,stylechecks +envlist = integration_tests,differential_tests,typechecks,stylechecks skipsdist = True @@ -29,6 +29,22 @@ commands = {posargs:tests/} +[testenv:differential_tests] +envdir = {toxworkdir}/integration_tests +deps = + {[testenv]deps} + +setenv = + PYTHONPATH=. + +commands = + pytest \ + -s \ + -vv \ + -m differential \ + {posargs:tests/} + + [testenv:typechecks] envdir = {toxworkdir}/integration_tests @@ -55,9 +71,10 @@ max-line-length = 90 [pytest] markers = integration: mark a test as an integration test. + differential: mark a test as a differential test. filterwarnings = ignore::DeprecationWarning ignore::RuntimeWarning ignore::UserWarning - ignore::FutureWarning \ No newline at end of file + ignore::FutureWarning From 110fdc70ed2e2e58d79e475b2ef381c1ed64bc8b Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 8 Dec 2019 10:41:56 +0000 Subject: [PATCH 09/26] Advanced Testing Production ML Code - Create Differential Tests In Docker --- .gitignore | 4 + packages/ml_api/Makefile | 18 +- .../differential_tests/__init__.py | 0 .../ml_api/differential_tests/__main__.py | 100 ++++ .../{tests => }/differential_tests/compare.py | 49 +- .../sample_payloads/sample_input1.json | 488 ++++++++++++++++++ packages/ml_api/docker/Dockerfile.test | 21 + .../docker/docker-compose-ci-candidate.yml | 20 + .../docker/docker-compose-ci-master.yml | 20 + packages/ml_api/docker/docker-compose.yml | 1 + .../ml_api/requirements/test_requirements.txt | 6 +- packages/ml_api/scripts/differential_tests.sh | 58 +++ .../test_back_to_back_models.py | 2 +- 13 files changed, 782 
insertions(+), 5 deletions(-) rename packages/ml_api/{tests => }/differential_tests/__init__.py (100%) create mode 100644 packages/ml_api/differential_tests/__main__.py rename packages/ml_api/{tests => }/differential_tests/compare.py (55%) create mode 100644 packages/ml_api/differential_tests/sample_payloads/sample_input1.json create mode 100644 packages/ml_api/docker/Dockerfile.test create mode 100644 packages/ml_api/docker/docker-compose-ci-candidate.yml create mode 100644 packages/ml_api/docker/docker-compose-ci-master.yml create mode 100755 packages/ml_api/scripts/differential_tests.sh rename packages/ml_api/tests/{differential_tests => }/test_back_to_back_models.py (95%) diff --git a/.gitignore b/.gitignore index 82d285c..f887dad 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,7 @@ test.csv # trained models packages/gradient_boosting_model/gradient_boosting_model/trained_models/*.pkl *.h5 + +# differential test artifacts +packages/ml_api/differential_tests/expected_results/ +packages/ml_api/differential_tests/actual_results/ diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile index bca844b..d8e26da 100644 --- a/packages/ml_api/Makefile +++ b/packages/ml_api/Makefile @@ -1,7 +1,23 @@ # For details on Makefiles, see the section notes. 
+NAME=ml_api +VERSION=$(shell git rev-parse HEAD) +REPO=UPDATEME +PASSWORD=UPDATEME # Specify phony list to ensure make recipes do not conflict with real file names -.PHONY: run-service-development +.PHONY: run-service-development tag-push-master tag-push-local + +tag-push-local: + @echo "+ $@" + docker login --username $(REPO) --password $(PASSWORD) + env TARGET=$(VERSION) docker-compose -f docker/docker-compose-ci-candidate.yml build + docker push $(REPO)/$(NAME):$(VERSION) + +tag-push-master: + @echo "+ $@" + docker login --username $(REPO) --password $(PASSWORD) + env TARGET=master docker-compose -f docker/docker-compose-ci-master.yml build + docker push $(REPO)/$(NAME):master # start up Flask API service run-service-development: diff --git a/packages/ml_api/tests/differential_tests/__init__.py b/packages/ml_api/differential_tests/__init__.py similarity index 100% rename from packages/ml_api/tests/differential_tests/__init__.py rename to packages/ml_api/differential_tests/__init__.py diff --git a/packages/ml_api/differential_tests/__main__.py b/packages/ml_api/differential_tests/__main__.py new file mode 100644 index 0000000..cf86222 --- /dev/null +++ b/packages/ml_api/differential_tests/__main__.py @@ -0,0 +1,100 @@ +import json +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Mapping + +from differential_tests.compare import compare_predictions +from api.config import ROOT + +from termcolor import cprint +from yarl import URL +import requests + +Marginals = Mapping[str, Mapping[str, float]] + + +def parse_args() -> Namespace: + parser = ArgumentParser() + + subparsers = parser.add_subparsers(dest="command") + + compute_parser = subparsers.add_parser( + "compute", help="Compute the predictions for a test set" + ) + compute_parser.add_argument( + "--base-url", + default=URL("http://0.0.0.0:5000"), + type=URL, + help="Base URL of the service to test", + ) + compute_parser.add_argument( + "tests_dir", type=Path, 
help="Directory containing the test set to use" + ) + compute_parser.add_argument( + "results_dir", type=Path, help="Directory to save the prediction results to" + ) + + compare_parser = subparsers.add_parser( + "compare", help="Compare the actual results with the expected results" + ) + compare_parser.add_argument( + "--absolute-tolerance", + dest="abs_tol", + metavar="X", + type=float, + help="math.isclose(a, b, abs_tol=X)", + default=1e-5, + ) + compare_parser.add_argument( + "--relative-tolerance", + dest="rel_tol", + metavar="X", + type=float, + default=1e-5, + help="math.isclose(a, b, rel_tol=X)", + ) + compare_parser.add_argument( + "expected_results_dir", + type=Path, + help="Directory containing the expected results", + ) + compare_parser.add_argument( + "actual_results_dir", type=Path, help="Directory containing the actual results" + ) + + return parser.parse_args() + + +def main(args: Namespace) -> None: + if args.command == "compute": + compute_predictions(args) + elif args.command == "compare": + compare_predictions(args) + + +def compute_predictions(args: Namespace) -> None: + print("computing") + + diff_test_dir = ROOT / "differential_tests" + results_dir = args.results_dir + results_dir.mkdir(parents=True, exist_ok=True) + prepared_test_dir = diff_test_dir / Path(args.tests_dir) + + for test_filename in sorted(prepared_test_dir.glob("*.json")): + results_filename = results_dir / test_filename.name + print(f"Computing {results_filename} from {test_filename} ... 
", end="") + + with test_filename.open() as f: + test = json.load(f) + + results = requests.post(f"{args.base_url}/v1/predictions/primary", json=test) + + with results_filename.open("w") as f: + json.dump(results.json(), f, indent=2, sort_keys=True) + + cprint("OK", "green") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/packages/ml_api/tests/differential_tests/compare.py b/packages/ml_api/differential_tests/compare.py similarity index 55% rename from packages/ml_api/tests/differential_tests/compare.py rename to packages/ml_api/differential_tests/compare.py index 1610a27..012dd16 100644 --- a/packages/ml_api/tests/differential_tests/compare.py +++ b/packages/ml_api/differential_tests/compare.py @@ -1,5 +1,12 @@ -import typing as t +import json import math +import sys +import typing as t +from argparse import Namespace + +from termcolor import cprint + +from api.config import ROOT def compare_differences( @@ -40,7 +47,6 @@ def compare_differences( for index, (actual_prediction, expected_prediction) in enumerate( zip(actual_predictions, expected_predictions) ): - if not math.isclose(expected_prediction, actual_prediction, **thresholds): raise ValueError( f"Price prediction {index} has changed by more " @@ -48,3 +54,42 @@ def compare_differences( f"{expected_prediction} (expected) vs " f"{actual_prediction} (actual)" ) + + +def compare_predictions(args: Namespace) -> None: + expected_results_dir = ROOT / args.expected_results_dir + actual_results_dir = ROOT / args.actual_results_dir + + expected_results_filenames = list(expected_results_dir.glob("*.json")) + + if not expected_results_filenames: + print("No results found!") + sys.exit(1) + + for expected_results_filename in sorted(expected_results_filenames): + name = expected_results_filename.name + actual_results_filename = actual_results_dir / name + + print( + f"Comparing {expected_results_filename} with {actual_results_filename} ... 
", + end="", + ) + + with expected_results_filename.open() as f: + expected_results = json.load(f) + + with actual_results_filename.open() as f: + actual_results = json.load(f) + + try: + compare_differences( + expected_predictions=expected_results["predictions"], + actual_predictions=actual_results["predictions"], + rel_tol=args.rel_tol, + abs_tol=args.abs_tol, + ) + except ValueError as exc: + cprint("ERROR", "red") + cprint(f" • {exc}", "red") + else: + cprint("OK", "green") diff --git a/packages/ml_api/differential_tests/sample_payloads/sample_input1.json b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json new file mode 100644 index 0000000..61f96e6 --- /dev/null +++ b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json @@ -0,0 +1,488 @@ +[{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11622, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1961, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 896, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + 
"TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}, { + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + 
"PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}, +{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 22689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, 
+ "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 988, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 11689, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": 
"AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + "YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2008, + "SaleType": "WD", + "SaleCondition": "Normal" +},{ + "Id": 1461, + "MSSubClass": 20, + "MSZoning": "RH", + "LotFrontage": 80.0, + "LotArea": 25000, + "Street": "Pave", + "Alley": null, + "LotShape": "Reg", + "LandContour": "Lvl", + "Utilities": "AllPub", + "LotConfig": "Inside", + "LandSlope": "Gtl", + "Neighborhood": "NAmes", + "Condition1": "Feedr", + "Condition2": "Norm", + "BldgType": "1Fam", + "HouseStyle": "1Story", + "OverallQual": 5, + "OverallCond": 6, + "YearBuilt": 1969, + 
"YearRemodAdd": 1961, + "RoofStyle": "Gable", + "RoofMatl": "CompShg", + "Exterior1st": "VinylSd", + "Exterior2nd": "VinylSd", + "MasVnrType": "None", + "MasVnrArea": 0.0, + "ExterQual": "TA", + "ExterCond": "TA", + "Foundation": "CBlock", + "BsmtQual": "TA", + "BsmtCond": "TA", + "BsmtExposure": "No", + "BsmtFinType1": "Rec", + "BsmtFinSF1": 468.0, + "BsmtFinType2": "LwQ", + "BsmtFinSF2": 144.0, + "BsmtUnfSF": 270.0, + "TotalBsmtSF": 882.0, + "Heating": "GasA", + "HeatingQC": "TA", + "CentralAir": "Y", + "Electrical": "SBrkr", + "1stFlrSF": 752, + "2ndFlrSF": 0, + "LowQualFinSF": 0, + "GrLivArea": 896, + "BsmtFullBath": 0.0, + "BsmtHalfBath": 0.0, + "FullBath": 1, + "HalfBath": 0, + "BedroomAbvGr": 2, + "KitchenAbvGr": 1, + "KitchenQual": "TA", + "TotRmsAbvGrd": 5, + "Functional": "Typ", + "Fireplaces": 0, + "FireplaceQu": null, + "GarageType": "Attchd", + "GarageYrBlt": 1961.0, + "GarageFinish": "Unf", + "GarageCars": 1.0, + "GarageArea": 730.0, + "GarageQual": "TA", + "GarageCond": "TA", + "PavedDrive": "Y", + "WoodDeckSF": 140, + "OpenPorchSF": 0, + "EnclosedPorch": 0, + "3SsnPorch": 0, + "ScreenPorch": 120, + "PoolArea": 0, + "PoolQC": null, + "Fence": "MnPrv", + "MiscFeature": null, + "MiscVal": 0, + "MoSold": 6, + "YrSold": 2010, + "SaleType": "WD", + "SaleCondition": "Normal" +}] \ No newline at end of file diff --git a/packages/ml_api/docker/Dockerfile.test b/packages/ml_api/docker/Dockerfile.test new file mode 100644 index 0000000..77da899 --- /dev/null +++ b/packages/ml_api/docker/Dockerfile.test @@ -0,0 +1,21 @@ +FROM python:3.7.5-slim-buster + +RUN mkdir -p /opt/app +COPY requirements /opt/app/requirements +RUN pip install --upgrade pip + +# ensure we can run the make commands +RUN apt-get update -y && \ + apt-get install -y make && \ + apt-get install -y libffi-dev gcc && \ + # for swagger + apt-get install -y curl + +ENV PYTHONPATH "${PYTHONPATH}:/opt/app" +RUN pip install -r /opt/app/requirements/test_requirements.txt +COPY Makefile 
/opt/app/Makefile +COPY api /opt/app/api +COPY run.py /opt/app/run.py +COPY differential_tests /opt/app/differential_tests +COPY tests /opt/app/tests +WORKDIR /opt/app diff --git a/packages/ml_api/docker/docker-compose-ci-candidate.yml b/packages/ml_api/docker/docker-compose-ci-candidate.yml new file mode 100644 index 0000000..47bbc16 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-ci-candidate.yml @@ -0,0 +1,20 @@ +version: '3' +services: + + ml_api: + image: christophergs/ml_api:${TARGET} + environment: + SERVER_PORT: ${SERVER_PORT:-5001} + build: + context: ../ + dockerfile: docker/Dockerfile.test + ports: + - "5001:5001" + tty: true + command: bash -c "make run-service-development" + + differential-tests: + image: christophergs/ml_api:${TARGET} + command: ["true"] + depends_on: + - ml_api \ No newline at end of file diff --git a/packages/ml_api/docker/docker-compose-ci-master.yml b/packages/ml_api/docker/docker-compose-ci-master.yml new file mode 100644 index 0000000..c0844e3 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-ci-master.yml @@ -0,0 +1,20 @@ +version: '3' +services: + + ml_api: + image: christophergs/ml_api:${TARGET} + environment: + SERVER_PORT: ${SERVER_PORT:-5000} + build: + context: ../ + dockerfile: docker/Dockerfile.test + ports: + - "5000:5000" + tty: true + command: bash -c "make run-service-development" + + differential-tests: + image: christophergs/ml_api:${TARGET} + command: ["true"] + depends_on: + - ml_api \ No newline at end of file diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml index efe4b10..8c9334c 100644 --- a/packages/ml_api/docker/docker-compose.yml +++ b/packages/ml_api/docker/docker-compose.yml @@ -1,6 +1,7 @@ version: '3' services: ml_api: + image: christophergs/ml_api:master build: context: ../ dockerfile: docker/Dockerfile diff --git a/packages/ml_api/requirements/test_requirements.txt b/packages/ml_api/requirements/test_requirements.txt index 
5a3a35f..2a0a147 100644 --- a/packages/ml_api/requirements/test_requirements.txt +++ b/packages/ml_api/requirements/test_requirements.txt @@ -6,4 +6,8 @@ pytest>=5.3.2,<6.0.0 # repo maintenance tooling black>=19.10b0,<20.0 flake8>=3.7.9,<4.0 -mypy>=0.740 \ No newline at end of file +mypy>=0.740 + +# diff test tooling +termcolor==1.1.0 +yarl==1.3.0 \ No newline at end of file diff --git a/packages/ml_api/scripts/differential_tests.sh b/packages/ml_api/scripts/differential_tests.sh new file mode 100755 index 0000000..98c8a91 --- /dev/null +++ b/packages/ml_api/scripts/differential_tests.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -euox pipefail + +MODEL_VERSION="master" +MODEL_VARIANT="candidate" +NUMBER_OF_TESTS="50" + +CANDIDATE_MODEL_SHA="$(git rev-parse HEAD)" + +# required once only (or whenever you make local changes): +# comment these two lines out otherwise as they can take some time. +make tag-push-local + +# should only be run once a model version has been finalized +# best practice is to run as part of a CI pipeline on merge to master branch. 
+make tag-push-master + +## Pull latest published image +env TARGET=master docker-compose --file docker/docker-compose.yml pull + +# start latest (master) image and local image +env TARGET=master SERVER_PORT=5000 docker-compose --project-name master --file docker/docker-compose-ci-master.yml up --no-recreate -d ml_api +env TARGET=$CANDIDATE_MODEL_SHA SERVER_PORT=5001 docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml up --no-recreate -d ml_api + +## Start the test runner containers +env TARGET=master docker-compose --project-name master --file docker/docker-compose-ci-master.yml run -d --name differential-tests-expected differential-tests sleep infinity +env TARGET=$CANDIDATE_MODEL_SHA docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml run -d --name differential-tests-actual differential-tests sleep infinity + +docker ps --all + +echo "===== Running $CANDIDATE_MODEL_SHA ... =====" + +## Compute the actual predictions (i.e. candidate model) +docker exec --user root differential-tests-actual \ + python3 differential_tests compute sample_payloads differential_tests/actual_results --base-url http://head_ml_api_1:5001 + +## Copy the actual predictions +docker cp differential-tests-actual:/opt/app/differential_tests/actual_results/. differential_tests/actual_results + +echo "===== Running master ... =====" +## Compute the expected marginals (i.e. existing model) +docker exec --user root differential-tests-expected \ + python3 differential_tests compute sample_payloads differential_tests/expected_results --base-url http://master_ml_api_1:5000 + +## Copy the expected marginals +docker cp differential-tests-expected:/opt/app/differential_tests/expected_results/. differential_tests/expected_results + +# then copy all results into the differential-tests-actual container for comparison +docker cp differential_tests/expected_results/. 
differential-tests-actual:/opt/app/differential_tests/expected_results + +echo "===== Comparing $CANDIDATE_MODEL_SHA vs. master ... =====" +## Compare the expected and actual marginals +docker exec differential-tests-actual \ + python3 -m differential_tests compare differential_tests/expected_results differential_tests/actual_results + +# clear any docker containers (will stop the script if no containers found) +docker rm $(docker ps -a -q) -f diff --git a/packages/ml_api/tests/differential_tests/test_back_to_back_models.py b/packages/ml_api/tests/test_back_to_back_models.py similarity index 95% rename from packages/ml_api/tests/differential_tests/test_back_to_back_models.py rename to packages/ml_api/tests/test_back_to_back_models.py index 85f1f67..9533438 100644 --- a/packages/ml_api/tests/differential_tests/test_back_to_back_models.py +++ b/packages/ml_api/tests/test_back_to_back_models.py @@ -4,7 +4,7 @@ from gradient_boosting_model.processing.data_management import load_dataset from tests.test_api import SECONDARY_VARIABLES_TO_RENAME -from .compare import compare_differences +from differential_tests.compare import compare_differences @pytest.mark.differential From 889139762424e4d925e95c1ce7e27a8e6271453b Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 15 Dec 2019 17:22:54 +0000 Subject: [PATCH 10/26] Shadow Mode ML Code - Initial Setup --- packages/ml_api/.dockerignore | 3 + packages/ml_api/Makefile | 7 +- packages/ml_api/alembic.ini | 49 ++++++++++++ packages/ml_api/alembic/env.py | 63 +++++++++++++++ packages/ml_api/alembic/script.py.mako | 24 ++++++ .../cf4abb13368d_create_prediction_tables.py | 78 +++++++++++++++++++ packages/ml_api/api/app.py | 6 +- packages/ml_api/api/config.py | 4 + packages/ml_api/api/controller.py | 17 +++- packages/ml_api/api/persistence/__init__.py | 0 packages/ml_api/api/persistence/core.py | 53 +++++++++++++ .../ml_api/api/persistence/data_access.py | 53 +++++++++++++ packages/ml_api/api/persistence/models.py | 29 
+++++++ packages/ml_api/docker/Dockerfile | 7 +- packages/ml_api/docker/docker-compose.yml | 25 +++++- packages/ml_api/requirements/requirements.txt | 6 ++ packages/ml_api/tox.ini | 19 +++++ 17 files changed, 435 insertions(+), 8 deletions(-) create mode 100644 packages/ml_api/alembic.ini create mode 100644 packages/ml_api/alembic/env.py create mode 100644 packages/ml_api/alembic/script.py.mako create mode 100644 packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py create mode 100644 packages/ml_api/api/persistence/__init__.py create mode 100644 packages/ml_api/api/persistence/core.py create mode 100644 packages/ml_api/api/persistence/data_access.py create mode 100644 packages/ml_api/api/persistence/models.py diff --git a/packages/ml_api/.dockerignore b/packages/ml_api/.dockerignore index 8bd7171..26ab026 100644 --- a/packages/ml_api/.dockerignore +++ b/packages/ml_api/.dockerignore @@ -12,6 +12,9 @@ packages/gradient_boosting_model *.pytest_cache *.tox +# alembic +!alembic/env.py + # Byte-compiled / optimized / DLL files *__pycache__* *.py[cod] \ No newline at end of file diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile index d8e26da..a3979c2 100644 --- a/packages/ml_api/Makefile +++ b/packages/ml_api/Makefile @@ -5,7 +5,8 @@ REPO=UPDATEME PASSWORD=UPDATEME # Specify phony list to ensure make recipes do not conflict with real file names -.PHONY: run-service-development tag-push-master tag-push-local +.PHONY: run-service-development tag-push-master tag-push-local db-migrations + tag-push-local: @echo "+ $@" @@ -23,3 +24,7 @@ tag-push-master: run-service-development: @echo "+ $@" python run.py + +db-migrations: + @echo "+ $@" + alembic -c alembic.ini upgrade head diff --git a/packages/ml_api/alembic.ini b/packages/ml_api/alembic.ini new file mode 100644 index 0000000..604e701 --- /dev/null +++ b/packages/ml_api/alembic.ini @@ -0,0 +1,49 @@ +# A generic, single database configuration. 
+ +[alembic] +# path to migration scripts +script_location = alembic + +# timezone to use when rendering the date +# within the migration file as well as the filename. +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +timezone = UTC + +sqlalchemy.url = VALUE_IS_SET_AT_RUNTIME + + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/packages/ml_api/alembic/env.py b/packages/ml_api/alembic/env.py new file mode 100644 index 0000000..377cb2e --- /dev/null +++ b/packages/ml_api/alembic/env.py @@ -0,0 +1,63 @@ +import os + +from alembic import context +from sqlalchemy import engine_from_config, pool + +# Import the models so the changes in them are automatically reflected in the +# generated migrations. +from api.persistence import models # noqa +from api.config import DevelopmentConfig as user_config +from api.persistence.core import Base + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config +database_url = os.environ.get("ALEMBIC_DB_URI", user_config.SQLALCHEMY_DATABASE_URI) +config.set_main_option("sqlalchemy.url", database_url) + +# add your model's MetaData object here +# for 'autogenerate' support +target_metadata = Base.metadata + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + This configures the context with just a URL + and not a user_ratings, though a user_ratings is acceptable + here as well. 
By skipping the user_ratings creation + we don't even need a DBAPI to be available. + Calls to context.execute() here emit the given string to the + script output. + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, target_metadata=target_metadata, literal_binds=True, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + In this scenario we need to create a user_ratings + and associate a connection with the context. + """ + alembic_config = config.get_section(config.config_ini_section) + connectable = engine_from_config( + alembic_config, prefix="sqlalchemy.", poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/packages/ml_api/alembic/script.py.mako b/packages/ml_api/alembic/script.py.mako new file mode 100644 index 0000000..2c01563 --- /dev/null +++ b/packages/ml_api/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. 
+revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py new file mode 100644 index 0000000..a26fb19 --- /dev/null +++ b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py @@ -0,0 +1,78 @@ +"""create prediction tables + +Revision ID: cf4abb13368d +Revises: +Create Date: 2019-12-15 14:54:07.857500+00:00 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "cf4abb13368d" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "gradient_boosting_model_predictions", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.String(length=36), nullable=False), + sa.Column( + "datetime_captured", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("model_version", sa.String(length=36), nullable=False), + sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_gradient_boosting_model_predictions_datetime_captured"), + "gradient_boosting_model_predictions", + ["datetime_captured"], + unique=False, + ) + op.create_table( + "regression_model_predictions", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.String(length=36), nullable=False), + sa.Column( + "datetime_captured", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("model_version", sa.String(length=36), nullable=False), + sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_regression_model_predictions_datetime_captured"), + "regression_model_predictions", + ["datetime_captured"], + unique=False, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index( + op.f("ix_regression_model_predictions_datetime_captured"), + table_name="regression_model_predictions", + ) + op.drop_table("regression_model_predictions") + op.drop_index( + op.f("ix_gradient_boosting_model_predictions_datetime_captured"), + table_name="gradient_boosting_model_predictions", + ) + op.drop_table("gradient_boosting_model_predictions") + # ### end Alembic commands ### diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py index 81848d4..759dc41 100644 --- a/packages/ml_api/api/app.py +++ b/packages/ml_api/api/app.py @@ -3,6 +3,7 @@ import connexion from api.config import Config +from api.persistence.core import init_database _logger = logging.getLogger(__name__) @@ -16,8 +17,11 @@ def create_app(*, config_object: Config) -> connexion.App: ) flask_app = connexion_app.app flask_app.config.from_object(config_object) - connexion_app.add_api("api.yaml") + # Setup database + init_database(flask_app, config=config_object) + + connexion_app.add_api("api.yaml") _logger.info("Application instance created") return connexion_app diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py index 6bb032a..5da5cc3 100644 --- a/packages/ml_api/api/config.py +++ b/packages/ml_api/api/config.py @@ -22,6 +22,10 @@ class Config: SERVER_PORT = int(os.getenv("SERVER_PORT", 5000)) SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0") LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO) + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{os.getenv('DB_USER')}:" + f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}" + ) class DevelopmentConfig(Config): diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 5da3d97..0f2763d 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -1,8 +1,10 @@ -from flask import request, jsonify, Response +import json + +from flask import request, jsonify, Response, current_app from 
gradient_boosting_model.predict import make_prediction from regression_model.predict import make_prediction as make_regression_prediction -import json +from api.persistence.data_access import PredictionPersistence, ModelType def health(): @@ -27,7 +29,16 @@ def predict(): predictions = result.get("predictions").tolist() version = result.get("version") - # Step 5: Prepare prediction response + # Step 5: Save predictions + persistence = PredictionPersistence(db_session=current_app.db_session) + persistence.save_predictions( + inputs=json_data, + model_version=version, + predictions=predictions, + db_model=ModelType.GRADIENT_BOOSTING, + ) + + # Step 6: Prepare prediction response return jsonify( {"predictions": predictions, "version": version, "errors": errors} ) diff --git a/packages/ml_api/api/persistence/__init__.py b/packages/ml_api/api/persistence/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py new file mode 100644 index 0000000..ac3793a --- /dev/null +++ b/packages/ml_api/api/persistence/core.py @@ -0,0 +1,53 @@ +import logging + +from flask import Flask +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import scoped_session, sessionmaker + +from api.config import Config + +_logger = logging.getLogger(__name__) + +# Base class for SQLAlchemy models +Base = declarative_base() + + +def create_db_engine_from_config(*, config: Config) -> Engine: + """The Engine is the starting point for any SQLAlchemy application. + + It’s “home base” for the actual database and its DBAPI, delivered to the SQLAlchemy + application through a connection pool and a Dialect, which describes how to talk to + a specific kind of database / DBAPI combination. 
+ """ + + engine = create_engine(config.SQLALCHEMY_DATABASE_URI,) + + _logger.info(f"creating DB conn with URI: {config.SQLALCHEMY_DATABASE_URI}") + return engine + + +def create_db_session(*, engine: Engine) -> scoped_session: + """Broadly speaking, the Session establishes all conversations with the database. + + It represents a “holding zone” for all the objects which you’ve loaded or + associated with it during its lifespan. + """ + return scoped_session( + sessionmaker(autocommit=False, autoflush=False, bind=engine,), + ) + + +def init_database(app: Flask, config: Config, db_session=None) -> None: + """Connect to the database and attach DB session to the app.""" + + if not db_session: + engine = create_db_engine_from_config(config=config) + db_session = create_db_session(engine=engine) + + app.db_session = db_session + + @app.teardown_appcontext + def shutdown_session(exception=None): + db_session.remove() diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py new file mode 100644 index 0000000..46fb88b --- /dev/null +++ b/packages/ml_api/api/persistence/data_access.py @@ -0,0 +1,53 @@ +import enum +import logging +import typing as t + +from sqlalchemy.orm.session import Session + +from api.persistence.models import ( + LassoModelPredictions, + GradientBoostingModelPredictions, +) + +_logger = logging.getLogger(__name__) + + +class ModelType(enum.Enum): + LASSO = "lasso" + GRADIENT_BOOSTING = "gradient_boosting" + + +class PredictionPersistence: + def __init__(self, *, db_session: Session, user_id: str = None) -> None: + self.db_session = db_session + if not user_id: + # in reality, here we would use something like a UUID for anonymous users + # and if we had user logins, we would record the user ID. 
+ self.user_id = "007" + + def save_predictions( + self, + *, + inputs: t.List, + model_version: str, + predictions: t.List, + db_model: ModelType, + ) -> None: + if db_model == db_model.LASSO: + prediction_data = LassoModelPredictions( + user_id=self.user_id, + model_version=model_version, + inputs=inputs, + outputs=predictions, + ) + else: + prediction_data = GradientBoostingModelPredictions( + user_id=self.user_id, + model_version=model_version, + inputs=inputs, + outputs=predictions, + ) + + self.db_session.add(prediction_data) + self.db_session.commit() + _logger.debug(f"saved data for model: {db_model}") diff --git a/packages/ml_api/api/persistence/models.py b/packages/ml_api/api/persistence/models.py new file mode 100644 index 0000000..65da0b8 --- /dev/null +++ b/packages/ml_api/api/persistence/models.py @@ -0,0 +1,29 @@ +from sqlalchemy import Column, String, DateTime, Integer +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.sql import func + +from api.persistence.core import Base + + +class LassoModelPredictions(Base): + __tablename__ = "regression_model_predictions" + id = Column(Integer, primary_key=True) + user_id = Column(String(36), nullable=False) + datetime_captured = Column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + model_version = Column(String(36), nullable=False) + inputs = Column(JSONB) + outputs = Column(JSONB) + + +class GradientBoostingModelPredictions(Base): + __tablename__ = "gradient_boosting_model_predictions" + id = Column(Integer, primary_key=True) + user_id = Column(String(36), nullable=False) + datetime_captured = Column( + DateTime(timezone=True), server_default=func.now(), index=True + ) + model_version = Column(String(36), nullable=False) + inputs = Column(JSONB) + outputs = Column(JSONB) diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile index f374941..9c8f6c7 100644 --- a/packages/ml_api/docker/Dockerfile +++ b/packages/ml_api/docker/Dockerfile @@ 
-9,10 +9,15 @@ RUN apt-get update -y && \ apt-get install -y make && \ apt-get install -y libffi-dev gcc && \ # for swagger - apt-get install -y curl + apt-get install -y curl && \ + # for postgres driver + apt-get install -y libpq-dev RUN pip install -r /opt/app/requirements/requirements.txt +ENV PYTHONPATH "${PYTHONPATH}:/opt/app/" COPY Makefile /opt/app/Makefile COPY api /opt/app/api COPY run.py /opt/app/run.py +COPY alembic.ini /opt/app/alembic.ini +COPY alembic /opt/app/alembic WORKDIR /opt/app diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml index 8c9334c..399c022 100644 --- a/packages/ml_api/docker/docker-compose.yml +++ b/packages/ml_api/docker/docker-compose.yml @@ -1,10 +1,31 @@ version: '3' services: ml_api: - image: christophergs/ml_api:master build: context: ../ dockerfile: docker/Dockerfile + environment: + DB_HOST: database + DB_PORT: 5432 + DB_USER: user + DB_PASSWORD: ${DB_PASSWORD:-password} + DB_NAME: ml_api + depends_on: + - database ports: - "5000:5000" # expose webserver to localhost host:container - command: bash -c "make run-service-development" + command: bash -c "make db-migrations && make run-service-development" + + database: + image: postgres:latest + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api + ports: + - "6609:5432" + volumes: + - my_dbdata:/var/lib/postgresql/data + +volumes: + my_dbdata: diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt index ae99f24..6d80ed5 100644 --- a/packages/ml_api/requirements/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -12,3 +12,9 @@ connexion[swagger-ui]>=2.5.1,<2.6.0 black>=19.10b0,<20.0 flake8>=3.7.9,<4.0 mypy>=0.740 + +# Persistence + +sqlalchemy>=1.3.11,<1.4.0 # ORM +psycopg2>=2.8.4,<2.9.0 # DB Driver +alembic>=1.3.1,<1.4.0 # DB Migrations diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 1a8ecc2..674f073 
100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -9,6 +9,11 @@ install_command = pip install {opts} {packages} deps = -rrequirements/test_requirements.txt +passenv = +# A list of wildcard environment variable names which shall be copied from +# the tox invocation environment to the test environment when executing test commands + DB_* + commands= py.test @@ -18,8 +23,15 @@ envdir = {toxworkdir}/integration_tests deps = {[testenv]deps} +passenv = + {[testenv]passenv} + setenv = PYTHONPATH=. + DB_USER={env:DB_USER:user} + DB_PASSWORD={env:DB_PASSWORD:password} + DB_HOST={env:DB_HOST:localhost} + DB_NAME={env:DB_NAME:ml_api} commands = pytest \ @@ -34,8 +46,15 @@ envdir = {toxworkdir}/integration_tests deps = {[testenv]deps} +passenv = + {[testenv]passenv} + setenv = PYTHONPATH=. + DB_USER={env:DB_USER:user} + DB_PASSWORD={env:DB_PASSWORD:password} + DB_HOST={env:DB_HOST:localhost} + DB_NAME={env:DB_NAME:ml_api} commands = pytest \ From 33c83184d791cf98129ddf6562cbc8d233c71bfe Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 21 Dec 2019 08:51:51 +0000 Subject: [PATCH 11/26] Shadow Mode ML Code - Implementation and Tests --- packages/ml_api/Makefile | 2 +- packages/ml_api/api/app.py | 8 +- packages/ml_api/api/config.py | 27 ++++++- packages/ml_api/api/controller.py | 48 ++++++------ packages/ml_api/api/persistence/core.py | 26 ++++++- .../ml_api/api/persistence/data_access.py | 77 ++++++++++++++++--- packages/ml_api/api/spec/api.yaml | 4 +- packages/ml_api/docker/Dockerfile | 7 +- packages/ml_api/docker/Dockerfile.test | 7 +- .../ml_api/docker/docker-compose.test.yml | 33 ++++++++ packages/ml_api/docker/docker-compose.yml | 5 +- packages/ml_api/requirements/requirements.txt | 2 +- packages/ml_api/tests/conftest.py | 43 ++++++++++- packages/ml_api/tests/test_api.py | 54 +++++++++---- .../ml_api/tests/test_back_to_back_models.py | 2 +- packages/ml_api/tests/test_persistence.py | 36 +++++++++ packages/ml_api/tox.ini | 29 +++++-- 17 files 
changed, 330 insertions(+), 80 deletions(-) create mode 100644 packages/ml_api/docker/docker-compose.test.yml create mode 100644 packages/ml_api/tests/test_persistence.py diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile index a3979c2..67a1214 100644 --- a/packages/ml_api/Makefile +++ b/packages/ml_api/Makefile @@ -27,4 +27,4 @@ run-service-development: db-migrations: @echo "+ $@" - alembic -c alembic.ini upgrade head + PYTHONPATH=. alembic -c alembic.ini upgrade head diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py index 759dc41..ccb8e48 100644 --- a/packages/ml_api/api/app.py +++ b/packages/ml_api/api/app.py @@ -1,15 +1,17 @@ import logging import connexion +from sqlalchemy.orm import scoped_session from api.config import Config from api.persistence.core import init_database - _logger = logging.getLogger(__name__) -def create_app(*, config_object: Config) -> connexion.App: +def create_app( + *, config_object: Config, db_session: scoped_session = None +) -> connexion.App: """Create app instance.""" connexion_app = connexion.App( @@ -19,7 +21,7 @@ def create_app(*, config_object: Config) -> connexion.App: flask_app.config.from_object(config_object) # Setup database - init_database(flask_app, config=config_object) + init_database(flask_app, config=config_object, db_session=db_session) connexion_app.add_api("api.yaml") _logger.info("Application instance created") diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py index 5da5cc3..05ae140 100644 --- a/packages/ml_api/api/config.py +++ b/packages/ml_api/api/config.py @@ -26,6 +26,12 @@ class Config: f"postgresql+psycopg2://{os.getenv('DB_USER')}:" f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}" ) + # DB config matches docker container + DB_USER = os.getenv("DB_USER", "user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6609) + DB_HOST = os.getenv("DB_HOST", "0.0.0.0") + DB_NAME = 
os.getenv("DB_NAME", "ml_api_dev") class DevelopmentConfig(Config): @@ -39,9 +45,28 @@ class TestingConfig(Config): TESTING = True LOGGING_LEVEL = logging.DEBUG + # DB config matches test docker container + DB_USER = os.getenv("DB_USER", "test_user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6608) + DB_HOST = os.getenv("DB_HOST", "0.0.0.0") + DB_NAME = "ml_api_test" + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{DB_USER}:" + f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" + ) + class ProductionConfig(Config): - pass + DB_USER = os.getenv("DB_USER", "user") + DB_PASSWORD = os.getenv("DB_PASSWORD", "password") + DB_PORT = os.getenv("DB_PORT", 6609) + DB_HOST = os.getenv("DB_HOST", "database") + DB_NAME = os.getenv("DB_NAME", "ml_api") + SQLALCHEMY_DATABASE_URI = ( + f"postgresql+psycopg2://{DB_USER}:" + f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" + ) def get_console_handler(): diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 0f2763d..717872c 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -1,12 +1,15 @@ import json +import logging from flask import request, jsonify, Response, current_app -from gradient_boosting_model.predict import make_prediction -from regression_model.predict import make_prediction as make_regression_prediction +from gradient_boosting_model.predict import make_prediction from api.persistence.data_access import PredictionPersistence, ModelType +_logger = logging.getLogger(__name__) + + def health(): if request.method == "GET": return jsonify({"status": "ok"}) @@ -17,30 +20,29 @@ def predict(): # Step 1: Extract POST data from request body as JSON json_data = request.get_json() - # Step 2: Access the model prediction function (also validates data) - result = make_prediction(input_data=json_data) - - # Step 3: Handle errors - errors = result.get("errors") - if errors: - return Response(json.dumps(errors), 
status=400) - - # Step 4: Split out results - predictions = result.get("predictions").tolist() - version = result.get("version") - - # Step 5: Save predictions + # Step 2a: Get and save live model predictions persistence = PredictionPersistence(db_session=current_app.db_session) - persistence.save_predictions( - inputs=json_data, - model_version=version, - predictions=predictions, - db_model=ModelType.GRADIENT_BOOSTING, + result = persistence.make_save_predictions( + db_model=ModelType.LASSO, input_data=json_data ) - # Step 6: Prepare prediction response + # Step 2b: Get and save shadow predictions + shadow_result = persistence.make_save_predictions( # noqa + db_model=ModelType.GRADIENT_BOOSTING, input_data=json_data + ) + + # Step 3: Handle errors + if result.errors: + _logger.warning(f"errors during prediction: {result.errors}") + return Response(json.dumps(result.errors), status=400) + + # Step 4: Prepare prediction response return jsonify( - {"predictions": predictions, "version": version, "errors": errors} + { + "predictions": result.predictions, + "version": result.model_version, + "errors": result.errors, + } ) @@ -50,7 +52,7 @@ def predict_previous(): json_data = request.get_json() # Step 2: Access the model prediction function (also validates data) - result = make_regression_prediction(input_data=json_data) + result = make_prediction(input_data=json_data) # Step 3: Handle errors errors = result.get("errors") diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py index ac3793a..4620c4a 100644 --- a/packages/ml_api/api/persistence/core.py +++ b/packages/ml_api/api/persistence/core.py @@ -1,12 +1,15 @@ import logging +import os +import alembic.config from flask import Flask from sqlalchemy import create_engine from sqlalchemy.engine import Engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import scoped_session, sessionmaker +from sqlalchemy_utils import database_exists, create_database 
-from api.config import Config +from api.config import Config, ROOT _logger = logging.getLogger(__name__) @@ -22,9 +25,12 @@ def create_db_engine_from_config(*, config: Config) -> Engine: a specific kind of database / DBAPI combination. """ - engine = create_engine(config.SQLALCHEMY_DATABASE_URI,) + db_url = config.SQLALCHEMY_DATABASE_URI + if not database_exists(db_url): + create_database(db_url) + engine = create_engine(db_url) - _logger.info(f"creating DB conn with URI: {config.SQLALCHEMY_DATABASE_URI}") + _logger.info(f"creating DB conn with URI: {db_url}") return engine @@ -51,3 +57,17 @@ def init_database(app: Flask, config: Config, db_session=None) -> None: @app.teardown_appcontext def shutdown_session(exception=None): db_session.remove() + + +def run_migrations(): + """Run the DB migrations prior to the tests.""" + + # alembic looks for the migrations in the current + # directory so we change to the correct directory. + os.chdir(str(ROOT)) + alembicArgs = [ + "--raiseerr", + "upgrade", + "head", + ] + alembic.config.main(argv=alembicArgs) diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py index 46fb88b..fa8a838 100644 --- a/packages/ml_api/api/persistence/data_access.py +++ b/packages/ml_api/api/persistence/data_access.py @@ -1,7 +1,12 @@ import enum +import json import logging import typing as t +import numpy as np +import pandas as pd +from gradient_boosting_model.predict import make_prediction as make_shadow_prediction +from regression_model.predict import make_prediction as make_live_prediction from sqlalchemy.orm.session import Session from api.persistence.models import ( @@ -9,6 +14,12 @@ GradientBoostingModelPredictions, ) +SECONDARY_VARIABLES_TO_RENAME = { + "FirstFlrSF": "1stFlrSF", + "SecondFlrSF": "2ndFlrSF", + "ThreeSsnPortch": "3SsnPorch", +} + _logger = logging.getLogger(__name__) @@ -17,35 +28,83 @@ class ModelType(enum.Enum): GRADIENT_BOOSTING = "gradient_boosting" +class 
PredictionResult(t.NamedTuple): + errors: t.Any + predictions: np.array + model_version: str + + +MODEL_PREDICTION_MAP = { + ModelType.GRADIENT_BOOSTING: make_shadow_prediction, + ModelType.LASSO: make_live_prediction, +} + + class PredictionPersistence: - def __init__(self, *, db_session: Session, user_id: str = None) -> None: + def __init__(self, *, db_session: Session, user_id: str = None,) -> None: self.db_session = db_session if not user_id: # in reality, here we would use something like a UUID for anonymous users # and if we had user logins, we would record the user ID. self.user_id = "007" + def make_save_predictions( + self, *, db_model: ModelType, input_data: t.List + ) -> PredictionResult: + """Get the prediction from a given model and persist it.""" + # Access the model prediction function via mapping + if db_model == ModelType.LASSO: + # we have to rename a few of the columns for backwards + # compatibility with the regression model package. + live_frame = pd.DataFrame(input_data) + input_data = live_frame.rename( + columns=SECONDARY_VARIABLES_TO_RENAME + ).to_dict(orient="records") + + result = MODEL_PREDICTION_MAP[db_model](input_data=input_data) + errors = None + try: + errors = result["errors"] + except KeyError: + # regression model `make_prediction` does not include errors + pass + + prediction_result = PredictionResult( + errors=errors, + predictions=result.get("predictions").tolist() if not errors else None, + model_version=result.get("version"), + ) + + if prediction_result.errors: + return prediction_result + + self.save_predictions( + inputs=input_data, prediction_result=prediction_result, db_model=db_model, + ) + + return prediction_result + def save_predictions( self, *, inputs: t.List, - model_version: str, - predictions: t.List, + prediction_result: PredictionResult, db_model: ModelType, ) -> None: + """Persist model predictions to storage.""" if db_model == db_model.LASSO: prediction_data = LassoModelPredictions( user_id=self.user_id, - 
model_version=model_version, - inputs=inputs, - outputs=predictions, + model_version=prediction_result.model_version, + inputs=json.dumps(inputs), + outputs=json.dumps(prediction_result.predictions), ) else: prediction_data = GradientBoostingModelPredictions( user_id=self.user_id, - model_version=model_version, - inputs=inputs, - outputs=predictions, + model_version=prediction_result.model_version, + inputs=json.dumps(inputs), + outputs=json.dumps(prediction_result.predictions), ) self.db_session.add(prediction_data) diff --git a/packages/ml_api/api/spec/api.yaml b/packages/ml_api/api/spec/api.yaml index 0b95033..84c3075 100644 --- a/packages/ml_api/api/spec/api.yaml +++ b/packages/ml_api/api/spec/api.yaml @@ -19,7 +19,7 @@ paths: '200': description: API Health Status - /v1/predictions/gradient: + /v1/predictions/regression: post: operationId: api.controller.predict requestBody: @@ -39,7 +39,7 @@ paths: '5XX': description: Unexpected error - /v1/predictions/regression: + /v1/predictions/gradient: post: operationId: api.controller.predict_previous requestBody: diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile index 9c8f6c7..9c948fc 100644 --- a/packages/ml_api/docker/Dockerfile +++ b/packages/ml_api/docker/Dockerfile @@ -15,9 +15,6 @@ RUN apt-get update -y && \ RUN pip install -r /opt/app/requirements/requirements.txt ENV PYTHONPATH "${PYTHONPATH}:/opt/app/" -COPY Makefile /opt/app/Makefile -COPY api /opt/app/api -COPY run.py /opt/app/run.py -COPY alembic.ini /opt/app/alembic.ini -COPY alembic /opt/app/alembic + +ADD . 
/opt/app WORKDIR /opt/app diff --git a/packages/ml_api/docker/Dockerfile.test b/packages/ml_api/docker/Dockerfile.test index 77da899..46c29ac 100644 --- a/packages/ml_api/docker/Dockerfile.test +++ b/packages/ml_api/docker/Dockerfile.test @@ -13,9 +13,6 @@ RUN apt-get update -y && \ ENV PYTHONPATH "${PYTHONPATH}:/opt/app" RUN pip install -r /opt/app/requirements/test_requirements.txt -COPY Makefile /opt/app/Makefile -COPY api /opt/app/api -COPY run.py /opt/app/run.py -COPY differential_tests /opt/app/differential_tests -COPY tests /opt/app/tests + +ADD . /opt/app WORKDIR /opt/app diff --git a/packages/ml_api/docker/docker-compose.test.yml b/packages/ml_api/docker/docker-compose.test.yml new file mode 100644 index 0000000..44109b6 --- /dev/null +++ b/packages/ml_api/docker/docker-compose.test.yml @@ -0,0 +1,33 @@ +version: '3' +services: + ml_api_test: + image: christophergs/ml_api:master + build: + context: ../ + dockerfile: docker/Dockerfile.test + environment: + DB_HOST: test_database + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: ${DB_PASSWORD:-password} + DB_NAME: ml_api_test + depends_on: + - test_database + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make db-migrations && make run-service-development" + + test_database: + image: postgres:latest + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + ports: + # expose postgres container on different host port to default (host:container) + - "6608:5432" + volumes: + - my_dbdata_test:/var/lib/postgresql/test_data + +volumes: + my_dbdata_test: diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml index 399c022..431588c 100644 --- a/packages/ml_api/docker/docker-compose.yml +++ b/packages/ml_api/docker/docker-compose.yml @@ -9,7 +9,7 @@ services: DB_PORT: 5432 DB_USER: user DB_PASSWORD: ${DB_PASSWORD:-password} - DB_NAME: ml_api + DB_NAME: ml_api_dev depends_on: - database 
ports: @@ -21,8 +21,9 @@ services: environment: POSTGRES_USER: user POSTGRES_PASSWORD: password - POSTGRES_DB: ml_api + POSTGRES_DB: ml_api_dev ports: + # expose postgres container on different host port to default (host:container) - "6609:5432" volumes: - my_dbdata:/var/lib/postgresql/data diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt index 6d80ed5..ee8837f 100644 --- a/packages/ml_api/requirements/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -14,7 +14,7 @@ flake8>=3.7.9,<4.0 mypy>=0.740 # Persistence - sqlalchemy>=1.3.11,<1.4.0 # ORM psycopg2>=2.8.4,<2.9.0 # DB Driver alembic>=1.3.1,<1.4.0 # DB Migrations +sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils diff --git a/packages/ml_api/tests/conftest.py b/packages/ml_api/tests/conftest.py index c42f56a..8939e04 100644 --- a/packages/ml_api/tests/conftest.py +++ b/packages/ml_api/tests/conftest.py @@ -1,12 +1,41 @@ +import os + +from unittest import mock import pytest +from gradient_boosting_model.processing.data_management import load_dataset +from sqlalchemy_utils import create_database, database_exists -from api.config import TestingConfig from api.app import create_app +from api.config import TestingConfig +from api.persistence import core + + +@pytest.fixture(scope='session') +def _db(): + db_url = TestingConfig.SQLALCHEMY_DATABASE_URI + if not database_exists(db_url): + create_database(db_url) + # alembic can be configured through the configuration file. For testing + # purposes 'env.py' also checks the 'ALEMBIC_DB_URI' variable first. + engine = core.create_db_engine_from_config(config=TestingConfig()) + evars = {"ALEMBIC_DB_URI": db_url} + with mock.patch.dict(os.environ, evars): + core.run_migrations() + + yield engine @pytest.fixture(scope='session') -def app(): - app = create_app(config_object=TestingConfig()).app +def _db_session(_db): + """ Create DB session for testing. 
+ """ + session = core.create_db_session(engine=_db) + yield session + + +@pytest.fixture(scope='session') +def app(_db_session): + app = create_app(config_object=TestingConfig(), db_session=_db_session).app with app.app_context(): yield app @@ -15,3 +44,11 @@ def app(): def client(app): with app.test_client() as client: yield client # Has to be yielded to access session cookies + + +@pytest.fixture +def test_inputs_df(): + # Load the gradient boosting test dataset which + # is included in the model package + test_inputs_df = load_dataset(file_name="test.csv") + return test_inputs_df.copy(deep=True) diff --git a/packages/ml_api/tests/test_api.py b/packages/ml_api/tests/test_api.py index 91ff466..6c7a7d0 100644 --- a/packages/ml_api/tests/test_api.py +++ b/packages/ml_api/tests/test_api.py @@ -2,14 +2,13 @@ import numpy as np import pytest -from gradient_boosting_model.processing.data_management import load_dataset - -SECONDARY_VARIABLES_TO_RENAME = { - "FirstFlrSF": "1stFlrSF", - "SecondFlrSF": "2ndFlrSF", - "ThreeSsnPortch": "3SsnPorch", -} +from api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME +from api.persistence.models import ( + GradientBoostingModelPredictions, + LassoModelPredictions, +) +from gradient_boosting_model.processing.data_management import load_dataset @pytest.mark.integration @@ -27,19 +26,21 @@ def test_health_endpoint(client): "api_endpoint, expected_no_predictions", ( ( - "v1/predictions/gradient", + "v1/predictions/regression", # test csv contains 1459 rows # we expect 2 rows to be filtered - 1457, + 1451, ), ( - "v1/predictions/regression", + "v1/predictions/gradient", # we expect 8 rows to be filtered - 1451, + 1457, ), ), ) -def test_prediction_endpoint(api_endpoint, expected_no_predictions, client): +def test_prediction_endpoint( + api_endpoint, expected_no_predictions, client, test_inputs_df +): # Given # Load the test dataset which is included in the model package test_inputs_df = load_dataset(file_name="test.csv") # 
dataframe @@ -87,11 +88,10 @@ def test_prediction_endpoint(api_endpoint, expected_no_predictions, client): ), ) @pytest.mark.integration -def test_prediction_validation(field, field_value, index, expected_error, client): +def test_prediction_validation( + field, field_value, index, expected_error, client, test_inputs_df +): # Given - # Load the test dataset which is included in the model package - test_inputs_df = load_dataset(file_name="test.csv") # dataframe - # Check gradient_boosting_model.processing.validation import HouseDataInputSchema # and you will see the expected values for the inputs to the house price prediction # model. In this test, inputs are changed to incorrect values to check the validation. @@ -106,3 +106,25 @@ def test_prediction_validation(field, field_value, index, expected_error, client assert response.status_code == 400 data = json.loads(response.data) assert data == expected_error + + +@pytest.mark.integration +def test_prediction_data_saved(client, app, test_inputs_df): + # Given + gradient_record_count = app.db_session.query( + GradientBoostingModelPredictions + ).count() + lasso_record_count = app.db_session.query(LassoModelPredictions).count() + + # When + response = client.post( + "/v1/predictions/regression", json=test_inputs_df.to_dict(orient="records") + ) + + # Then + assert response.status_code == 200 + assert ( + app.db_session.query(GradientBoostingModelPredictions).count() + == gradient_record_count + 1 + ) + assert app.db_session.query(LassoModelPredictions).count() == lasso_record_count + 1 diff --git a/packages/ml_api/tests/test_back_to_back_models.py b/packages/ml_api/tests/test_back_to_back_models.py index 9533438..af98797 100644 --- a/packages/ml_api/tests/test_back_to_back_models.py +++ b/packages/ml_api/tests/test_back_to_back_models.py @@ -3,7 +3,7 @@ import pytest from gradient_boosting_model.processing.data_management import load_dataset -from tests.test_api import SECONDARY_VARIABLES_TO_RENAME +from 
api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME from differential_tests.compare import compare_differences diff --git a/packages/ml_api/tests/test_persistence.py b/packages/ml_api/tests/test_persistence.py new file mode 100644 index 0000000..8172157 --- /dev/null +++ b/packages/ml_api/tests/test_persistence.py @@ -0,0 +1,36 @@ +from unittest import mock +import pytest + +from api.persistence.data_access import PredictionPersistence, ModelType + +from api.persistence.models import ( + GradientBoostingModelPredictions, + LassoModelPredictions, +) + + +# parameterizationa allows us to try many combinations of data +# within the same test, see the pytest docs for details: +# https://docs.pytest.org/en/latest/parametrize.html +@pytest.mark.parametrize( + "model_type, model,", + ( + (ModelType.GRADIENT_BOOSTING, GradientBoostingModelPredictions), + (ModelType.LASSO, LassoModelPredictions), + ), +) +def test_data_access(model_type, model, test_inputs_df): + # Given + # We mock the database session + mock_session = mock.MagicMock() + _persistence = PredictionPersistence(db_session=mock_session) + + # When + _persistence.make_save_predictions( + db_model=model_type, input_data=test_inputs_df.to_dict(orient="records") + ) + + # Then + assert mock_session.commit.call_count == 1 + assert mock_session.add.call_count == 1 + assert isinstance(mock_session.add.call_args[0][0], model) diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 674f073..30b2421 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = integration_tests,differential_tests,typechecks,stylechecks +envlist = integration_tests,unit_tests,differential_tests,typechecks,stylechecks skipsdist = True @@ -28,10 +28,10 @@ passenv = setenv = PYTHONPATH=. 
- DB_USER={env:DB_USER:user} + DB_USER={env:DB_USER:test_user} DB_PASSWORD={env:DB_PASSWORD:password} DB_HOST={env:DB_HOST:localhost} - DB_NAME={env:DB_NAME:ml_api} + DB_NAME={env:DB_NAME:ml_api_test} commands = pytest \ @@ -41,6 +41,25 @@ commands = {posargs:tests/} +[testenv:unit_tests] +envdir = {toxworkdir}/integration_tests +deps = + {[testenv]deps} + +passenv = + {[testenv]passenv} + +setenv = + PYTHONPATH=. + +commands = + pytest \ + -s \ + -vv \ + -m "not integration and not differential" \ + {posargs:tests/} + + [testenv:differential_tests] envdir = {toxworkdir}/integration_tests deps = @@ -51,10 +70,10 @@ passenv = setenv = PYTHONPATH=. - DB_USER={env:DB_USER:user} + DB_USER={env:DB_USER:test_user} DB_PASSWORD={env:DB_PASSWORD:password} DB_HOST={env:DB_HOST:localhost} - DB_NAME={env:DB_NAME:ml_api} + DB_NAME={env:DB_NAME:ml_api_test} commands = pytest \ From fb83a3eac775aba299eb15d1f929698757541649 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 21 Dec 2019 10:26:52 +0000 Subject: [PATCH 12/26] Shadow Mode ML Code - Asynchronous Implementation --- packages/ml_api/api/config.py | 1 + packages/ml_api/api/controller.py | 19 +++++++++++++---- packages/ml_api/api/persistence/core.py | 10 ++------- .../ml_api/api/persistence/data_access.py | 4 ++-- packages/ml_api/tests/test_api.py | 21 ++++++++++++------- packages/ml_api/tox.ini | 5 +++++ 6 files changed, 38 insertions(+), 22 deletions(-) diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py index 05ae140..03b74be 100644 --- a/packages/ml_api/api/config.py +++ b/packages/ml_api/api/config.py @@ -22,6 +22,7 @@ class Config: SERVER_PORT = int(os.getenv("SERVER_PORT", 5000)) SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0") LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO) + SHADOW_MODE_ACTIVE = os.getenv('SHADOW_MODE_ACTIVE', True) SQLALCHEMY_DATABASE_URI = ( f"postgresql+psycopg2://{os.getenv('DB_USER')}:" 
f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}" diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 717872c..8451e9a 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -1,5 +1,6 @@ import json import logging +import threading from flask import request, jsonify, Response, current_app @@ -26,10 +27,20 @@ def predict(): db_model=ModelType.LASSO, input_data=json_data ) - # Step 2b: Get and save shadow predictions - shadow_result = persistence.make_save_predictions( # noqa - db_model=ModelType.GRADIENT_BOOSTING, input_data=json_data - ) + # Step 2b: Get and save shadow predictions asynchronously + if current_app.config.get("SHADOW_MODE_ACTIVE"): + _logger.debug( + f"Calling shadow model asynchronously: " + f"{ModelType.GRADIENT_BOOSTING.value}" + ) + thread = threading.Thread( + target=persistence.make_save_predictions, + kwargs={ + "db_model": ModelType.GRADIENT_BOOSTING, + "input_data": json_data, + }, + ) + thread.start() # Step 3: Handle errors if result.errors: diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py index 4620c4a..7fd231a 100644 --- a/packages/ml_api/api/persistence/core.py +++ b/packages/ml_api/api/persistence/core.py @@ -40,9 +40,7 @@ def create_db_session(*, engine: Engine) -> scoped_session: It represents a “holding zone” for all the objects which you’ve loaded or associated with it during its lifespan. """ - return scoped_session( - sessionmaker(autocommit=False, autoflush=False, bind=engine,), - ) + return scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine)) def init_database(app: Flask, config: Config, db_session=None) -> None: @@ -65,9 +63,5 @@ def run_migrations(): # alembic looks for the migrations in the current # directory so we change to the correct directory. 
os.chdir(str(ROOT)) - alembicArgs = [ - "--raiseerr", - "upgrade", - "head", - ] + alembicArgs = ["--raiseerr", "upgrade", "head"] alembic.config.main(argv=alembicArgs) diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py index fa8a838..4feb247 100644 --- a/packages/ml_api/api/persistence/data_access.py +++ b/packages/ml_api/api/persistence/data_access.py @@ -41,7 +41,7 @@ class PredictionResult(t.NamedTuple): class PredictionPersistence: - def __init__(self, *, db_session: Session, user_id: str = None,) -> None: + def __init__(self, *, db_session: Session, user_id: str = None) -> None: self.db_session = db_session if not user_id: # in reality, here we would use something like a UUID for anonymous users @@ -79,7 +79,7 @@ def make_save_predictions( return prediction_result self.save_predictions( - inputs=input_data, prediction_result=prediction_result, db_model=db_model, + inputs=input_data, prediction_result=prediction_result, db_model=db_model ) return prediction_result diff --git a/packages/ml_api/tests/test_api.py b/packages/ml_api/tests/test_api.py index 6c7a7d0..3d9fb5b 100644 --- a/packages/ml_api/tests/test_api.py +++ b/packages/ml_api/tests/test_api.py @@ -1,4 +1,5 @@ import json +import time import numpy as np import pytest @@ -49,9 +50,7 @@ def test_prediction_endpoint( test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True) # When - response = client.post( - api_endpoint, json=test_inputs_df.to_dict(orient="records") - ) + response = client.post(api_endpoint, json=test_inputs_df.to_dict(orient="records")) # Then assert response.status_code == 200 @@ -84,7 +83,7 @@ def test_prediction_endpoint( 34, {"34": {"CentralAir": ["Field may not be null."]}}, ), - ("LotArea", "", 2, {"2": {"LotArea": ["Not a valid integer."]}},), + ("LotArea", "", 2, {"2": {"LotArea": ["Not a valid integer."]}}), ), ) @pytest.mark.integration @@ -111,10 +110,10 @@ def test_prediction_validation( 
@pytest.mark.integration def test_prediction_data_saved(client, app, test_inputs_df): # Given - gradient_record_count = app.db_session.query( + initial_gradient_count = app.db_session.query( GradientBoostingModelPredictions ).count() - lasso_record_count = app.db_session.query(LassoModelPredictions).count() + initial_lasso_count = app.db_session.query(LassoModelPredictions).count() # When response = client.post( @@ -123,8 +122,14 @@ def test_prediction_data_saved(client, app, test_inputs_df): # Then assert response.status_code == 200 + assert ( + app.db_session.query(LassoModelPredictions).count() == initial_lasso_count + 1 + ) + + # The gradient prediction save occurs on a separate async thread which can take + # time to complete. We pause the test briefly to allow the save operation to finish. + time.sleep(2) assert ( app.db_session.query(GradientBoostingModelPredictions).count() - == gradient_record_count + 1 + == initial_gradient_count + 1 ) - assert app.db_session.query(LassoModelPredictions).count() == lasso_record_count + 1 diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 30b2421..27b9439 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -13,6 +13,7 @@ passenv = # A list of wildcard environment variable names which shall be copied from # the tox invocation environment to the test environment when executing test commands DB_* + SHADOW_MODE_ACTIVE commands= py.test @@ -31,7 +32,9 @@ setenv = DB_USER={env:DB_USER:test_user} DB_PASSWORD={env:DB_PASSWORD:password} DB_HOST={env:DB_HOST:localhost} + DB_PORT={env:DB_PORT:6608} DB_NAME={env:DB_NAME:ml_api_test} + SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true} commands = pytest \ @@ -73,7 +76,9 @@ setenv = DB_USER={env:DB_USER:test_user} DB_PASSWORD={env:DB_PASSWORD:password} DB_HOST={env:DB_HOST:localhost} + DB_PORT={env:DB_PORT:6608} DB_NAME={env:DB_NAME:ml_api_test} + SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true} commands = pytest \ From 
46f8fa46d58667bbb9abcfa3a67355337479ad42 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 28 Dec 2019 16:26:22 +0000 Subject: [PATCH 13/26] Shadow Mode ML Code - Populate DB Script --- .../ml_api/requirements/test_requirements.txt | 1 + packages/ml_api/scripts/populate_database.py | 92 +++++++++++++++++++ packages/ml_api/tox.ini | 15 +++ 3 files changed, 108 insertions(+) create mode 100644 packages/ml_api/scripts/populate_database.py diff --git a/packages/ml_api/requirements/test_requirements.txt b/packages/ml_api/requirements/test_requirements.txt index 2a0a147..c909f64 100644 --- a/packages/ml_api/requirements/test_requirements.txt +++ b/packages/ml_api/requirements/test_requirements.txt @@ -2,6 +2,7 @@ # testing requirements pytest>=5.3.2,<6.0.0 +requests>=2.22.0,<2.23.0 # repo maintenance tooling black>=19.10b0,<20.0 diff --git a/packages/ml_api/scripts/populate_database.py b/packages/ml_api/scripts/populate_database.py new file mode 100644 index 0000000..717c4e2 --- /dev/null +++ b/packages/ml_api/scripts/populate_database.py @@ -0,0 +1,92 @@ +from gradient_boosting_model.processing.data_management import load_dataset +from gradient_boosting_model.config.core import config +import requests +import pandas as pd + +from random import randint +from itertools import islice +import json +import os +import typing as t +import time + + +LOCAL_URL = f'http://{os.getenv("DB_HOST", "localhost")}:5000' + +HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} + +LOT_AREA_MAP = {"min": 1470, "max": 56600} + +FIRST_FLR_SF_MAP = {"min": 407, "max": 5095} + +SECOND_FLR_SF_MAP = {"min": 0, "max": 1862} + + +def _generate_random_int(value: int, value_ranges: t.Mapping) -> int: + """Generate random integer within a min and max range.""" + random_value = randint(value_ranges["min"], value_ranges["max"]) + + return int(random_value) + + +def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: + """Prepare input data by removing key rows 
with NA values.""" + clean_inputs_df = dataframe.dropna( + subset=config.model_config.features + ["KitchenQual", "LotFrontage"] + ).copy() + + clean_inputs_df.loc[:, "FirstFlrSF"] = clean_inputs_df["FirstFlrSF"].apply( + _generate_random_int, value_ranges=FIRST_FLR_SF_MAP + ) + clean_inputs_df.loc[:, "SecondFlrSF"] = clean_inputs_df["SecondFlrSF"].apply( + _generate_random_int, value_ranges=SECOND_FLR_SF_MAP + ) + clean_inputs_df.loc[:, "LotArea"] = clean_inputs_df["LotArea"].apply( + _generate_random_int, value_ranges=LOT_AREA_MAP + ) + + return clean_inputs_df + + +def populate_database(n_predictions: int = 500) -> None: + """ + Manipulate the test data to generate random + predictions and save them to the database. + Before running this script, ensure that the + API and Database docker containers are running. + """ + + print(f"Preparing to generate: {n_predictions} predictions.") + + # Load the gradient boosting test dataset which + # is included in the model package + test_inputs_df = load_dataset(file_name="test.csv") + clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df) + if len(clean_inputs_df) < n_predictions: + print( + f"If you want {n_predictions} predictions, you need to" + "extend the script to handle more predictions." 
+ ) + + for index, data in clean_inputs_df.iterrows(): + if index > n_predictions: + break + + response = requests.post( + f"{LOCAL_URL}/v1/predictions/regression", + headers=HEADERS, + json=[data.to_dict()], + ) + response.raise_for_status() + + if index % 50 == 0: + print(f"{index} predictions complete") + + # prevent overloading the server + time.sleep(0.5) + + print("Prediction generation complete.") + + +if __name__ == "__main__": + populate_database(n_predictions=500) diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 27b9439..3b499c7 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -88,6 +88,21 @@ commands = {posargs:tests/} +[testenv:generate_predictions] +envdir = {toxworkdir}/generate_predictions +deps = + {[testenv]deps} + +passenv = + {[testenv]passenv} + +setenv = + PYTHONPATH=. + DB_HOST={env:DB_HOST:localhost} + +commands = python scripts/populate_database.py + + [testenv:typechecks] envdir = {toxworkdir}/integration_tests From c0f42d5e24988a5d50c7e846ed9ff249284aa90f Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 4 Jan 2020 14:54:25 +0000 Subject: [PATCH 14/26] Shadow Mode ML Code - Analyse Results --- .dockerignore | 18 + .../assessing_model_results.ipynb | 1740 +++++++++++++++++ .../shadow_mode_exercise/requirements.txt | 15 + 3 files changed, 1773 insertions(+) create mode 100644 .dockerignore create mode 100755 exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb create mode 100644 exercise_notebooks/shadow_mode_exercise/requirements.txt diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..c853bc5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +exercise_notebooks/* +*/env* +*/venv* +.circleci* +packages/gradient_boosting_model +*.env +*.log +.git +.gitignore +.dockerignore +*.mypy_cache +*.pytest_cache + +### Python ### + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] diff --git 
a/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb b/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb new file mode 100755 index 0000000..1733a2d --- /dev/null +++ b/exercise_notebooks/shadow_mode_exercise/assessing_model_results.ipynb @@ -0,0 +1,1740 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup\n", + "\n", + "1. Checkout the code at commit: **\"Shadow Mode ML Code - Analyse Results\"**\n", + "\n", + "\n", + "2. Make sure your virtualenv is active, and you have installed the dependencies listed in the requirements.txt located in the same directory as this notebook.\n", + "\n", + "\n", + "3. Make sure you have copied the Kaggle houseprice.csv into the same directory as this notebook. By this point in the course, you should have this csv file here:\n", + "\n", + "testing-and-monitoring-ml-deployments/packages/gradient_boosting_model/gradient_boosting_model/datasets/\n", + "\n", + "\n", + "4. Before running any cells in the notebook, start your docker containers: Open the terminal/Command prompt and navigate to this directory:\n", + "\n", + "testing-and-monitoring-ml-deployments/packages/ml_api\n", + "\n", + "5. Then run: `docker-compose -f docker/docker-compose.yml up -d --build`\n", + "\n", + "(Users of older Windows versions: remember to run `docker-machine start default` followed by `docker-machine env` before the docker compose command)\n", + "\n", + "6. 
Populate the DB with simulated shadow data by running\n", + "`tox -e generate_predictions` (also from the ml_api directory)\n", + "\n", + "Note: Feel free to run the populate DB command multiple times, however if you generate more than 10k requests then some of the tests below will fail.\n", + "\n", + "**Keep in mind that the populate_database.py script has some element of randomness, and that you and I may run it a different number of times, so the results shown in this notebook are indicative and may not be identical to yours**\n", + "\n", + "Focus instead on the take-home messages derived from the tests." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "import seaborn as sns\n", + "from sqlalchemy import create_engine\n", + "\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A reminder that SQLAlchemy DB URIs look like this:\n", + "`postgres+psycopg2://myuser:mypassword@hackersdb.example.com:5432/mydatabase`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "postgres+psycopg2://user:password@localhost:6609/ml_api_dev\n" + ] + } + ], + "source": [ + "# note that this connection string can be found in the app DevelopmentConfig.\n", + "\n", + "# to save hassle with updating the PATH so we can import the config object,\n", + "# we write it out in full here:\n", + "\n", + "db_uri = \"postgres+psycopg2://user:password@localhost:6609/ml_api_dev\"\n", + "print(db_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# connect to the database\n", + "\n", + "engine = create_engine(db_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, +
"metadata": {}, + "outputs": [], + "source": [ + "# Let's load our live data, that is, the predictions we generated and are stored\n", + "# in our database (takes about 30 seconds to run)\n", + "\n", + "sql_df = pd.read_sql_table(\"gradient_boosting_model_predictions\", con=engine)\n", + "\n", + "# munge json array of inputs from postgres jsonb field.\n", + "inputs_df = sql_df.inputs.apply(\n", + " lambda row: pd.DataFrame(json.loads(row))).tolist()\n", + "\n", + "live_data = pd.concat(inputs_df, sort=False)\n", + "outputs_df = sql_df.outputs.apply(lambda row: pd.Series(json.loads(row)))\n", + "live_data['SalePrice'] = outputs_df.values\n", + "live_data.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# rename live columns to match training data file\n", + "\n", + "SECONDARY_VARIABLES_TO_RENAME = {\n", + " \"FirstFlrSF\": \"1stFlrSF\",\n", + " \"SecondFlrSF\": \"2ndFlrSF\",\n", + " \"ThreeSsnPortch\": \"3SsnPorch\",\n", + "}\n", + "live_data.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# columns ==> inputs to the new model\n", + "\n", + "model_features = ['LotArea', 'OverallQual', 'YearRemodAdd',\n", + " 'BsmtQual', 'BsmtFinSF1','TotalBsmtSF',\n", + " '1stFlrSF', '2ndFlrSF', 'GrLivArea',\n", + " 'GarageCars', 'YrSold']\n", + "\n", + "# From the live data, we select only those variables that\n", + "# are actually used in the model\n", + "# and the predictions (SalePrice)\n", + "\n", + "live_data = live_data[model_features + ['SalePrice']]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LotAreaOverallQualYearRemodAddBsmtQualBsmtFinSF1TotalBsmtSF1stFlrSF2ndFlrSFGrLivAreaGarageCarsYrSoldSalePrice
1228378151969TA280.0720.0212562713082.02008124359.797157
12291529961976TA0.01444.04860120414442.02008157332.471992
12302205051974Gd0.0896.02384174817922.02008167039.397270
1231805241969Gd252.0936.0459511639361.02008121015.582065
12322850351971Gd119.0864.014033698642.02008145224.114862
\n", + "
" + ], + "text/plain": [ + " LotArea OverallQual YearRemodAdd BsmtQual BsmtFinSF1 TotalBsmtSF \\\n", + "1228 3781 5 1969 TA 280.0 720.0 \n", + "1229 15299 6 1976 TA 0.0 1444.0 \n", + "1230 22050 5 1974 Gd 0.0 896.0 \n", + "1231 8052 4 1969 Gd 252.0 936.0 \n", + "1232 28503 5 1971 Gd 119.0 864.0 \n", + "\n", + " 1stFlrSF 2ndFlrSF GrLivArea GarageCars YrSold SalePrice \n", + "1228 2125 627 1308 2.0 2008 124359.797157 \n", + "1229 4860 1204 1444 2.0 2008 157332.471992 \n", + "1230 2384 1748 1792 2.0 2008 167039.397270 \n", + "1231 4595 1163 936 1.0 2008 121015.582065 \n", + "1232 1403 369 864 2.0 2008 145224.114862 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "live_data.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LotAreaOverallQualYearRemodAddBsmtQualBsmtFinSF1TotalBsmtSF1stFlrSF2ndFlrSFGrLivAreaGarageCarsYrSoldSalePrice
0845072003Gd706856856854171022008208500
1960061976Gd978126212620126222007181500
21125072002Gd486920920866178622008223500
3955071970TA216756961756171732006140000
41426082000Gd655114511451053219832008250000
\n", + "
" + ], + "text/plain": [ + " LotArea OverallQual YearRemodAdd BsmtQual BsmtFinSF1 TotalBsmtSF \\\n", + "0 8450 7 2003 Gd 706 856 \n", + "1 9600 6 1976 Gd 978 1262 \n", + "2 11250 7 2002 Gd 486 920 \n", + "3 9550 7 1970 TA 216 756 \n", + "4 14260 8 2000 Gd 655 1145 \n", + "\n", + " 1stFlrSF 2ndFlrSF GrLivArea GarageCars YrSold SalePrice \n", + "0 856 854 1710 2 2008 208500 \n", + "1 1262 0 1262 2 2007 181500 \n", + "2 920 866 1786 2 2008 223500 \n", + "3 961 756 1717 3 2006 140000 \n", + "4 1145 1053 2198 3 2008 250000 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now let's load the data we used to train the model\n", + "\n", + "# remember to copy the houseprice.csv data to the directory of this Jupyter notebook\n", + "# or alternatively change the path below to find the file\n", + "\n", + "\n", + "# load needed columns + target\n", + "train_data = pd.read_csv('houseprice.csv',\n", + " usecols=model_features + ['SalePrice'])\n", + "\n", + "\n", + "train_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((1460, 12), (1233, 12))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's compare the shapes of both live and training data:\n", + "\n", + "train_data.shape, live_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Take this outputs as a demo of what to expect, and don't worry too much if the results are not identical. The more times you run the populate_database.py script, the larger the live_data will be." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Checks\n", + "\n", + "### Input checks - categorical variables\n", + "\n", + "We only have one categorical variable among our features:\n", + "\n", + "BsmtQual (Categorical): Evaluates the height of the basement\n", + "\n", + " Ex\tExcellent (100+ inches)\t\n", + " Gd\tGood (90-99 inches)\n", + " TA\tTypical (80-89 inches)\n", + " Fa\tFair (70-79 inches)\n", + " Po\tPoor (<70 inches)\n", + " NA\tNo Basement\n", + " \n", + "These are the values allowed according to how the variable was defined, and as we can see it can also take missing values.\n", + "\n", + "\n", + "You can find more details about the variable definitions and their permitted values here:\n", + "\n", + "[Source](http://bee-fore.s3-eu-west-1.amazonaws.com/datasets/62.txt)\n", + "\n", + "The first test aims to corroborate that live data takes only the permitted values. We can do as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Gd', 'TA', 'Ex', nan, 'Fa'], dtype=object)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's evaluate the unique values in our training data\n", + "\n", + "train_data['BsmtQual'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['TA', 'Gd', 'Ex', 'Fa'], dtype=object)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's now evaluate the unique values in our live data\n", + "\n", + "live_data['BsmtQual'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that both our training and live data take only the permitted values. We also see that the category Po is not present in either of the data sets, which is curious. 
Probably there were not that many basements in Poor conditions.\n", + "\n", + "If we wanted, we could write a short test as follows, and any number bigger than 0 would fail:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([x for x in live_data['BsmtQual'] if x not in ['Gd', 'TA', 'Ex', np.nan, 'Fa']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Value range checks - Numerical variables\n", + "\n", + "We have a numerical and discrete variable in our data set that can only take 1 of the permitted values:\n", + "\n", + "OverallQual (Ordinal): Rates the overall material and finish of the house\n", + "\n", + " 10\tVery Excellent\n", + " 9\tExcellent\n", + " 8\tVery Good\n", + " 7\tGood\n", + " 6\tAbove Average\n", + " 5\tAverage\n", + " 4\tBelow Average\n", + " 3\tFair\n", + " 2\tPoor\n", + " 1\tVery Poor\n", + " \n", + "Given that the number of different permitted values is small, we could do an input check as we did with BsmtQual, or, for the sake of the demo, we can check value ranges:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "min 1\n", + "max 10\n", + "Name: OverallQual, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we know that the min and max values are 1 and 10 according to the\n", + "# variable definition, we can indeed check that as follows:\n", + "\n", + "train_data['OverallQual'].agg(['min', 'max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "min 2\n", + "max 10\n", + "Name: OverallQual, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# let's repeat for the live variable \n", + "\n", + "live_data['OverallQual'].agg(['min', 'max'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The values are within the range, so we have no reason to worry.\n", + "\n", + "We could write a small test as follows and any return bigger than 0 should fail:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([x for x in live_data['OverallQual'] if x >10 or x <1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Excellent, we see that the variables in both our train and live data take values within the permitted range.\n", + "\n", + "**Note** we could also check value ranges for the remaining numerical variables; we just need to be confident that the variables can take only values within the range we want to evaluate.\n", + "\n", + "\n", + "### Missing value checks\n", + "\n", + "We know from our research phase, that these variables should not take missing data:\n", + "\n", + "numerical_na_not_allowed:\n", + " - LotArea\n", + " - OverallQual\n", + " - YearRemodAdd\n", + " - BsmtFinSF1\n", + " - TotalBsmtSF\n", + " - FirstFlrSF\n", + " - SecondFlrSF\n", + " - GrLivArea\n", + " - GarageCars\n", + " - YrSold" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# let's capture the above in a list\n", + "\n", + "numerical_na_not_allowed = ['LotArea', 'OverallQual', 'YearRemodAdd',\n", + " 'BsmtFinSF1', 'TotalBsmtSF', 'GarageCars',\n", + " 'YrSold']" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LotArea 0.0\n", + "OverallQual 0.0\n", + "YearRemodAdd 0.0\n", + "BsmtFinSF1 0.0\n", +
"TotalBsmtSF 0.0\n", + "GarageCars 0.0\n", + "YrSold 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# they should not take missing values in our train data\n", + "# let's check that:\n", + "\n", + "train_data[numerical_na_not_allowed].isnull().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LotArea 0.0\n", + "OverallQual 0.0\n", + "YearRemodAdd 0.0\n", + "BsmtFinSF1 0.0\n", + "TotalBsmtSF 0.0\n", + "GarageCars 0.0\n", + "YrSold 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's check if that is the case in our live data\n", + "# as well\n", + "\n", + "live_data[numerical_na_not_allowed].isnull().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perfect, as we can see, none of the variables that we are receiving live, take missing data where missing data is not expected. \n", + "\n", + "If we had gotten a value other than zero, we should probably investigate what's going on. We could have a bug in our code, or the variable could have changed its definition." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distribution checks\n", + "\n", + "### First, check proportion of missing values\n", + "\n", + "We know that BsmtQual can take missing data. We can, and should test, whether the proportion of missing data that we are getting live, is the same that we considered in our training data set. We can do so, using Fisher's exact test, as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... 
\n", + "1455 False\n", + "1456 False\n", + "1457 False\n", + "1458 False\n", + "1459 False\n", + "Name: BsmtQual, Length: 1460, dtype: bool" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check number of na in the variable in the train set\n", + "\n", + "train_data['BsmtQual'].isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... \n", + "1228 False\n", + "1229 False\n", + "1230 False\n", + "1231 False\n", + "1232 False\n", + "Name: BsmtQual, Length: 1233, dtype: bool" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check number of na in the live data\n", + "\n", + "live_data['BsmtQual'].isnull()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see no missing values in the live data, which already tells us that something may not be quite right.\n", + "\n", + "We can go ahead and test this with a statistical test: Fisher's exact test, as implemented in SciPy. In order to run this test, we need to create a 2x2 table, where live and train data are the columns, 0 or 1 (indicating missing values) are the rows, and the numbers represent the number of observations within each cell.\n", + "\n", + "Let's go ahead and do that:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# first, make 2 binary variables where we indicate if the variable took\n", + "# a missing value or not:\n", + "\n", + "train_data['BsmtQual_na'] = np.where(train_data['BsmtQual'].isnull(), 1, 0)\n", + "live_data['BsmtQual_na'] = np.where(live_data['BsmtQual'].isnull(), 1, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainlive
BsmtQual_na
014231233.0
137NaN
\n", + "
" + ], + "text/plain": [ + " train live\n", + "BsmtQual_na \n", + "0 1423 1233.0\n", + "1 37 NaN" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now let's generate the 2x2 table:\n", + "\n", + "ct = pd.concat([\n", + " train_data.groupby('BsmtQual_na')['BsmtQual_na'].count(),\n", + " live_data.groupby('BsmtQual_na')['BsmtQual_na'].count()\n", + "], axis=1)\n", + "\n", + "ct.columns = ['train', 'live']\n", + "\n", + "ct" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the table contains missing data in the second row of the live data, because as we saw, there were no missing values. We can't pass a table with np.nan to the test, so we need to fill it out." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainlive
BsmtQual_na
014231233.0
1370.0
\n", + "
" + ], + "text/plain": [ + " train live\n", + "BsmtQual_na \n", + "0 1423 1233.0\n", + "1 37 0.0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now we need to build a contingency table for this test\n", + "\n", + "ct.fillna(0, inplace=True)\n", + "ct" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.2719419961163252e-10" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and now we compare frequencies with Fisher's exact test\n", + "\n", + "oddsratio, pvalue = stats.fisher_exact(ct)\n", + "\n", + "pvalue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Aha, the p_value indicates that the distributions are significantly different. Which is not surprising given that we got 0 missing values in our live data. \n", + "\n", + "Is this a problem?\n", + "\n", + "We should start digging in our production code to see if we introduced either a bug, an exception or some sort of data filtering. If none of this happened, maybe the variable definition changed, for example.\n", + "\n", + "\n", + "### Categorical distribution test\n", + "\n", + "Similarly, we can use the same test to evaluate the proportion of categories we are getting in the categorical variables of our live data. 
\n", + "\n", + "As this contingency table is not a 2x2 table, we need to use a different implementation of the test, as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 649\n", + "Gd 618\n", + "Ex 121\n", + "Missing 37\n", + "Fa 35\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we first need the expected frequencies in each category\n", + "# that is the number of observations per category in the train data\n", + "\n", + "# we fill missing values with the string \"Missing\" as we did in our\n", + "# preprocessing steps\n", + "\n", + "train_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555\n", + "Gd 489\n", + "Ex 129\n", + "Fa 60\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now we need the received frequencies in each category\n", + "# in the live data data\n", + "\n", + "# we also fill in missing values with the string \"Missing\"\n", + "\n", + "live_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# we need to create 2 series of the same size\n", + "# with the counts we displayed in the previous cells\n", + "\n", + "ct = train_data['BsmtQual'].fillna('Missing').value_counts()\n", + "cl = live_data['BsmtQual'].fillna('Missing').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555\n", + "Gd 489\n", + "Ex 129\n", + "Fa 60\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TA 555.0\n", + "Gd 489.0\n", + "Ex 129.0\n", + "Fa 60.0\n", + "Missing 0.1\n", + "Name: BsmtQual, dtype: float64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's add the missing category to the live data\n", + "# (I add 0.1 to avoid divide by zero errors in the test below)\n", + "\n", + "cl['Missing'] = 0.1\n", + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# let's sort the index\n", + "\n", + "ct.sort_index(inplace=True)\n", + "cl.sort_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ex 121\n", + "Fa 35\n", + "Gd 618\n", + "Missing 37\n", + "TA 649\n", + "Name: BsmtQual, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ex 129.0\n", + "Fa 60.0\n", + "Gd 489.0\n", + "Missing 0.1\n", + "TA 555.0\n", + "Name: BsmtQual, dtype: float64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cl" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Power_divergenceResult(statistic=13676.964186265019, pvalue=0.0)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and now we compare frequencies with chi-square\n", + "\n", + "stats.chisquare(f_obs=ct,\n", + " f_exp=cl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "As we can see from the test, the p-value is 0, so the distributions in the live and train data are significantly different, which is what we expect given that we are not getting missing data in our live variable.\n", + "\n", + "For variables that are discrete in nature, like OverallQual, we could also use the above test, to compare the distributions. Let's do that:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Power_divergenceResult(statistic=137.5069575787079, pvalue=3.3726853043045783e-25)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create value counts series\n", + "\n", + "ct = train_data['OverallQual'].value_counts()\n", + "cl = live_data['OverallQual'].value_counts()\n", + "\n", + "cl[1] = 0.1\n", + "\n", + "ct.sort_index(inplace=True)\n", + "cl.sort_index(inplace=True)\n", + "\n", + "# and now we compare frequencies with chi-square\n", + "\n", + "stats.chisquare(f_obs=ct,\n", + " f_exp=cl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This variable as well shows a different category distribution in the live data, compared to train data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD7CAYAAACRxdTpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAXCklEQVR4nO3df5BV5X3H8fdHJPxQArhud5DFLk2IAaxZ40ro2EypJoqYEdKhaNpJqNrQNmSSTtq0JK2NmZaZNWliJa0aGkywTYNE40Cjbf2FzaSNkkUJgpiIZiNLEFciG39X9Ns/7gO5kIV7d++9Z5fHz2vmzj3neZ5zz3eX5bNnn3vuOYoIzMwsL8cNdQFmZlZ/Dnczsww53M3MMuRwNzPLkMPdzCxDDnczswwdP9QFAJx88snR1tY21GWYmR1TNm3a9ExENPfXNyzCva2tja6urqEuw8zsmCLpJ0fq87SMmVmGHO5mZhlyuJuZZajqOXdJI4AuYFdEvE/SVGAN0ARsAj4YEf8naRRwE3AWsBe4JCK66165mb3hvfrqq/T09PDyyy8PdSkNNXr0aFpbWxk5cmTV2wzkDdWPA9uBN6f1q4FrImKNpBuAK4Dr0/OzEfFWSZemcZcMYD9mZlXp6elh3LhxtLW1IWmoy2mIiGDv3r309PQwderUqreralpGUitwEfCVtC7gXOCWNGQ1sCAtz0/rpP7zlOt33cyG1Msvv0xTU1O2wQ4giaampgH/dVLtnPs/AH8BvJ7Wm4B9EbE/rfcAk9PyZGAnQOrvS+PNzOou52A/YDBfY8Vwl/Q+4OmI2DSYoo7yukskdUnq6u3tredLm5kVYt++fVx33XUD3m7evHns27evARX9QjVz7ucAF0uaB4ymNOd+LTBB0vHp6LwV2JXG7wKmAD2SjgfGU3pj9RARsRJYCdDR0eE7htiAtS27veKY7s6LCqjEhotqfiYGotLPz4Fw/8hHPnJI+/79+zn++CPH6x133FGX+o6m4pF7RHwqIlojog24FLg3In4f2AAsTMMWA+vS8vq0Tuq/N3y7JzPL0LJly3j88cdpb2/n7LPP5t3vfjcXX3wxM2bMAGDBggWcddZZzJw5k5UrVx7crq2tjWeeeYbu7m6mT5/Ohz/8YWbOnMn555/PSy+9VJfaajnP/S+BT0jaQWlOfVVqXwU0pfZPAMtqK9HMbHjq7OzkLW95C5s3b+bzn/88Dz74INdeey0/+tGPALjxxhvZtGkTXV1drFixgr17f2kSg8cee4ylS5eybds2JkyYwK233lqX2gZ0bZmIuA+4Ly0/AczqZ8zLwO/WoTYzs2PKrFmzDjldccWKFdx2220A7Ny5k8cee4ympkPPL5k6dSrt7e0AnHXWWXR3d9ellmFx4TAzsxyccMIJB5fvu+8+7r77br73ve8xduxY5syZ0+/pjKNGjTq4PGLEiGExLWNm9oY2btw4nnvuuX77+vr6mDhxImPHjuXRRx/l/vvvL7Q2H7mbmQ1SU1MT55xzDqeffjpjxoyhpaXlYN/cuXO54YYbmD59OqeddhqzZ88utDYNhxNZOjo6wtdzt4HyqZC2fft2pk+fPtRlFKK/r1XSpojo6G+8p2XMzDLkcDczy5DD3cwsQw53M7MM+WwZsxpVemPXb+raUPCRu5lZhhzuZmY1OPHEEwH46U9/ysKFCyuMLo6nZcwsH1eNr/Pr9VU99JRTTuGWW26p
PLAgPnI3M6uD7u5uTj/9dABmz57Ntm3bDvbNmTOHrq4uXnjhBS6//HJmzZrFmWeeybp16470cjVzuJuZ1dkll1zC2rVrAdi9eze7d++mo6OD5cuXc+6557Jx40Y2bNjAJz/5SV544YWG1OBwNzOrs0WLFh2colm7du3Bufg777yTzs5O2tvbD14l8sknn2xIDZ5zNzOrs8mTJ9PU1MSWLVu4+eabueGGGwCICG699VZOO+20htfgI3czswa45JJL+NznPkdfXx9nnHEGABdccAFf+tKXOHDBxoceeqhh+68Y7pJGS9oo6QeStkn6bGr/mqQfS9qcHu2pXZJWSNohaYukdzasejOzYWrhwoWsWbOGRYsWHWy78sorefXVVznjjDOYOXMmV155ZcP2X820zCvAuRHxvKSRwHcl/Ufq+2REHH7uz4XAtPR4F3B9ejYza6wBnLpYL88//zxQuun11q1bD7a3tLSwf//+Q8aOGTOGL3/5y4XUVfHIPUqeT6sj0+NoF4GfD9yUtrsfmCBpUu2lmplZtap6Q1XSCGAT8FbgnyLiAUl/AiyX9DfAPcCyiHgFmAzsLNu8J7XtrmvlZtWo9KGWITjSMytCVW+oRsRrEdEOtAKzJJ0OfAp4O3A2cBLwlwPZsaQlkrokdfX29g6wbDMzO5oBnS0TEfuADcDciNidpl5eAb4KzErDdgFTyjZrTW2Hv9bKiOiIiI7m5ubBVW9mb3jD4VahjTaYr7Gas2WaJU1Iy2OA9wKPHphHlyRgAXDgnYT1wIfSWTOzgb6I8JSMmdXd6NGj2bt3b9YBHxHs3buX0aNHD2i7aubcJwGr07z7ccDaiPi2pHslNQMCNgN/nMbfAcwDdgAvApcNqCIzsyq1trbS09ND7lO7o0ePprW1dUDbVAz3iNgCnNlP+7lHGB/A0gFVYWY2CCNHjmTq1KlDXcaw5E+ompllyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYaqulmHmdWg0g1DoOabhrQtu73imO7Oi2rahx1bfORuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpahau6hOlrSRkk/kLRN0mdT+1RJD0jaIelmSW9K7aPS+o7U39bYL8HMzA5XzZH7K8C5EfEOoB2Ym258fTVwTUS8FXgWuCKNvwJ4NrVfk8aZmVmBKoZ7lDyfVkemRwDnArek9tXAgrQ8P62T+s+TpLpVbGZmFVU15y5phKTNwNPAXcDjwL6I2J+G9ACT0/JkYCdA6u8DmupZtJmZHV1V4R4Rr0VEO9AKzALeXuuOJS2R1CWpq7e3t9aXMzOzMgM6WyYi9gEbgN8AJkg6cPmCVmBXWt4FTAFI/eOBvf281sqI6IiIjubm5kGWb2Zm/anmbJlmSRPS8hjgvcB2SiG/MA1bDKxLy+vTOqn/3oiIehZtZmZHV82FwyYBqyWNoPTLYG1EfFvSI8AaSX8HPASsSuNXAf8iaQfwM+DSBtRtZmZHUTHcI2ILcGY/7U9Qmn8/vP1l4HfrUp2ZmQ2KP6FqZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpahau6hOkXSBkmPSNom6eOp/SpJuyRtTo95Zdt8StIOST+UdEEjvwAzM/tl1dxDdT/wZxHxoKRxwCZJd6W+ayLi78sHS5pB6b6pM4FTgLslvS0iXqtn4WZmdmQVj9wjYndEPJiWnwO2A5OPssl8YE1EvBIRPwZ20M+9Vs3MrHEGNOcuqY3SzbIfSE0flbRF0o2SJqa2ycDOss16OPovAzMzq7Oqw13SicCtwJ9GxM+B64G3AO3AbuALA9mxpCWSuiR19fb2DmRTMzOroKpwlzSSUrB/PSK+BRAReyLitYh4HfhnfjH1sguYUrZ5a2o7
RESsjIiOiOhobm6u5WswM7PDVHO2jIBVwPaI+GJZ+6SyYe8Htqbl9cClkkZJmgpMAzbWr2QzM6ukmrNlzgE+CDwsaXNq+zTwAUntQADdwB8BRMQ2SWuBRyidabPUZ8qYmRWrYrhHxHcB9dN1x1G2WQ4sr6EuMzOrgT+hamaWIYe7mVmGHO5mZhlyuJuZZcjhbmaWIYe7mVmGqjnP3cxycNX4Cv19xdRhhfCRu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZquYeqlMkbZD0iKRtkj6e2k+SdJekx9LzxNQuSSsk7ZC0RdI7G/1FmJnZoao5ct8P/FlEzABmA0slzQCWAfdExDTgnrQOcCGlm2JPA5YA19e9ajMzO6qK4R4RuyPiwbT8HLAdmAzMB1anYauBBWl5PnBTlNwPTJA0qe6Vm5nZEQ1ozl1SG3Am8ADQEhG7U9dTQEtangzsLNusJ7WZmVlBqg53SScCtwJ/GhE/L++LiABiIDuWtERSl6Su3t7egWxqZmYVVBXukkZSCvavR8S3UvOeA9Mt6fnp1L4LmFK2eWtqO0RErIyIjojoaG5uHmz9ZmbWj2rOlhGwCtgeEV8s61oPLE7Li4F1Ze0fSmfNzAb6yqZvzMysANXciekc4IPAw5I2p7ZPA53AWklXAD8BFqW+O4B5wA7gReCyulZsZmYVVQz3iPguoCN0n9fP+ACW1liXmZnVwJ9QNTPLkMPdzCxDDnczsww53M3MMuRwNzPLkMPdzCxDDnczsww53M3MMuRwNzPLkMPdzCxDDnczsww53M3MMlTNVSHNzKrStuz2o/Z3d15UUCXmI3czsww53M3MMuRwNzPLkMPdzCxD1dxD9UZJT0vaWtZ2laRdkjanx7yyvk9J2iHph5IuaFThZmZ2ZNUcuX8NmNtP+zUR0Z4edwBImgFcCsxM21wnaUS9ijUzs+pUDPeI+A7wsypfbz6wJiJeiYgfU7pJ9qwa6jMzs0GoZc79o5K2pGmbialtMrCzbExPajMzswIN9kNM1wN/C0R6/gJw+UBeQNISYAnAqaeeOsgyzOyYctX4Ksb0Nb6ON4BBHblHxJ6IeC0iXgf+mV9MvewCppQNbU1t/b3GyojoiIiO5ubmwZRhZmZHMKhwlzSpbPX9wIEzadYDl0oaJWkqMA3YWFuJZmY2UBWnZSR9A5gDnCypB/gMMEdSO6VpmW7gjwAiYpuktcAjwH5gaUS81pjSzczsSCqGe0R8oJ/mVUcZvxxYXktRZmZWG39C1cwsQw53M7MMOdzNzDLkcDczy5DD3cwsQw53M7MMOdzNzDLkcDczy5DD3cwsQw53M7MMOdzNzDLkcDczy5DD3cwsQ4O9E5O9wbUtu/2o/d2dFxVUiZn1x0fuZmYZcribmWXI4W5mlqGK4S7pRklPS9pa1naSpLskPZaeJ6Z2SVohaYekLZLe2cjizcysf9UcuX8NmHtY2zLgnoiYBtyT1gEupHRT7GnAEuD6+pRpZmYDUTHcI+I7wM8Oa54PrE7Lq4EFZe03Rcn9wARJk+pVrJmZVWewc+4tEbE7LT8FtKTlycDOsnE9qc3MzApU8xuqERFADHQ7SUskdUnq6u3trbUMMzMrM9hw33NguiU9P53adwFTysa1prZfEhErI6IjIjqam5sHWYaZmfVnsOG+HliclhcD68raP5TOmpkN9JVN35iZWUEqXn5A0jeAOcDJknqAzwCdwFpJVwA/ARal4XcA84AdwIvAZQ2o2czMKqgY7hHxgSN0ndfP2ACW1lqUZeCq8VWM6Wt8HWZvUP6EqplZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhipeFdLMLDuVrlqawRVLfeRuZpYhh7uZWYYc7mZmGXK4m5llqKY3VCV1A88BrwH7I6JD0knAzUAb
0A0siohnayvTzMwGoh5H7r8dEe0R0ZHWlwH3RMQ04J60bmZmBWrEtMx8YHVaXg0saMA+zMzsKGoN9wDulLRJ0pLU1hIRu9PyU0BLfxtKWiKpS1JXb29vjWWYmVm5Wj/E9JsRsUvSrwB3SXq0vDMiQlL0t2FErARWAnR0dPQ7xszMBqemI/eI2JWenwZuA2YBeyRNAkjPT9dapJmZDcygw13SCZLGHVgGzge2AuuBxWnYYmBdrUWamdnA1DIt0wLcJunA6/xbRPynpO8DayVdAfwEWFR7mWZm1WlbdnvFMd2jCyhkiA063CPiCeAd/bTvBc6rpSgzM6uNP6FqZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYYc7mZmGXK4m5llyOFuZpYhh7uZWYZqveSvDYFK187o7ryooErMbLhyuJuZNcBQH4R5WsbMLEMOdzOzDHlaJkdXja/Q31dMHWY2ZBzu9VQpVMHBamaF8LSMmVmGGnbkLmkucC0wAvhKRHQ2al9mZsecBv+l35Ajd0kjgH8CLgRmAB+QNKMR+zIzs1/WqCP3WcCOdJ9VJK0B5gOPDPYFh/qc0apqeAPcdNfMjg2KiPq/qLQQmBsRf5jWPwi8KyI+WjZmCbAkrZ4G/LDG3Z4MPFPja9RqONQAw6OO4VADDI86hkMNMDzqGA41wPCoox41/GpENPfXMWRny0TESmBlvV5PUldEdNTr9Y7VGoZLHcOhhuFSx3CoYbjUMRxqGC51NLqGRp0tswuYUrbemtrMzKwAjQr37wPTJE2V9CbgUmB9g/ZlZmaHaci0TETsl/RR4L8onQp5Y0Rsa8S+ytRtiqcGw6EGGB51DIcaYHjUMRxqgOFRx3CoAYZHHQ2toSFvqJqZ2dDyJ1TNzDLkcDczy5DD3cwsQw73Gkh6u6TzJJ14WPvcguuYJenstDxD0ickzSuyhn5qumko959q+M30vTi/wH2+S9Kb0/IYSZ+V9O+SrpZUxcVE6lbHxyRNqTyyoTW8SdKHJL0nrf+epH+UtFTSyALr+DVJfy7pWklflPTHB/6NcpbdG6qSLouIrxawn48BS4HtQDvw8YhYl/oejIh3NrqGtK/PULqGz/HAXcC7gA3Ae4H/iojlBdRw+GmuAn4buBcgIi5udA2pjo0RMSstf5jSv89twPnAvxdx8TpJ24B3pDPGVgIvArcA56X232l0DamOPuAF4HHgG8A3I6K3iH2X1fB1Sj+XY4F9wInAtyh9LxQRiwuo4WPA+4DvAPOAh1It7wc+EhH3NbqGIRMRWT2AJwvaz8PAiWm5DeiiFPAADxX49T5M6XTTscDPgTen9jHAloJqeBD4V2AO8FvpeXda/q0CvxcPlS1/H2hOyycADxdUw/by78thfZuL/F5Q+sv8fGAV0Av8J7AYGFdQDVvS8/HAHmBEWleBP5sPl+13LHBfWj614P+n44FO4FHgZ8BeSgeGncCERuzzmJyWkbTlCI+HgZaCyjguIp4HiIhuSoF2oaQvUvrhLcr+iHgtIl4EHo+In6eaXgJeL6iGDmAT8FdAX5SOhl6KiP+OiP8uqAaA4yRNlNRE6ciwFyAiXgD2F1TDVkmXpeUfSOoAkPQ24NWCagCIiHg9Iu6MiCuAU4DrgLnAEwXVcFz6EOM4SsF6YFpqFFDYtAy/+DzPKEp/PRARTxZcw1rgWWBORJwUEU2U/rp9NvXV3bF6J6YW4AJK35hyAv63oBr2SGqPiM0AEfG8pPcBNwK/XlANAP8naWwK97MONKb53ULCPSJeB66R9M30vIeh+dkaT+mXjICQNCkidqf3RIr6hfuHwLWS/prSRaG+J2knsDP1FeWQrzciXqX0KfH1ksYWVMMqSkeqIyj94v+mpCeA2cCagmr4CvB9SQ8A7wauBpDUTOkIuihtEXF1eUNEPAVcLenyRuzwmJxzl7QK+GpEfLefvn+LiN8roIZWSkfNT/XTd05E/E+ja0j7GhURr/TTfjIwKSIe
LqKOw/Z9EXBORHy66H33J4VZS0T8uMB9vhmYSumXXE9E7Clq32n/b4uIHxW5zyPUcQpARPxU0gTgPZSmTjcWWMNMYDqwNSIeLWq/h9VwJ3A3sPrAz4KkFuAPgPdGxHvqvs9jMdzNzI4lkiYCyyjd1+JXUvMeSn9NdUbE4bMQte/T4W5mNnQadYafw93MbAhJejIiTq336x6rb6iamR0zJG05UhcNOsPP4W5m1niFn+HncDcza7xvU/rQ4+bDOyTd14gdes7dzCxDx+QnVM3M7Ogc7mZmGXK4m5llyOFuZpYhh7uZWYb+H1TCZQWfUcBOAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# to investigate this further we can plot the number of observations\n", + "# per value, in train and live data, as follows:\n", + "\n", + "tmp = pd.concat([ct,cl], axis=1)\n", + "tmp.columns = ['train', 'live']\n", + "tmp.plot.bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " We do see that there is a slight difference in the distribution of the 2 sources of data: we expected fewer observations for the values 2 and 9, and more for the values 1 and 10 in the live data.\n", + " \n", + "As a follow-up, we need to dig out the source of these discrepancies. Either we have a bug, or some sort of data filtering in our production code, or the variables may have changed their definitions.\n", + "\n", + "For continuous variables, we can use the Kolmogorov-Smirnov test as follows:\n", + "\n", + "### Numerical variable tests" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# let's inspect the distributions of our variables first\n", + "\n", + "train_data.hist(bins=30, figsize=(10,10))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.6634714306347143, pvalue=5.1793802892230377e-256)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's test a few variables with the KS test:\n", + "\n", + "stats.ks_2samp(train_data['LotArea'], live_data['LotArea'])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.08687964536879646, pvalue=7.606756582567709e-05)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['GrLivArea'], live_data['GrLivArea'])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.04766523347665234, pvalue=0.09196049437201348)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['BsmtFinSF1'], live_data['BsmtFinSF1'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-values may differ slightly between your run and this notebook, but the distributions are statistically different for LotArea and GrLivArea, while the difference for BsmtFinSF1 (p-value around 0.09) is not significant at the usual 0.05 level.\n", + "\n", + "So let's inspect this in more detail:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY0AAAD4CAYAAAAQP7oXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de3xc1X3v/c9PM7rZlm+ya7BsYyU2BGMIEGFMQhMCBGxoYtLHAdNwwmlI3XMCTXrSPAm0TZqQcBpyXic09ECAFp5S2sS4Di1OH+dACZCTEC4WARzbYCPAwTI3I8nC1m00o9/5Y6+xx2IkjYQ0s8d836+XXt6z9tprr70tz8/rstc2d0dERKQQFaWugIiIlA8FDRERKZiChoiIFExBQ0RECqagISIiBUuWugITadasWb5w4cJSV0NEpKw8+eSTb7r77Hz7juigsXDhQpqbm0tdDRGRsmJmvx1qn7qnRESkYAoaIiJSMAUNEREpWEFjGma2Avg+kAD+3t2/M2h/NfCPwAeANuASd98V9l0DXAFkgC+4+33DlWlmVwF/CrwXmO3ubw4612nAo8Aad98whmsWERlWf38/ra2t9Pb2lroqE6qmpoZ58+ZRWVlZ8DEjBg0zSwA3AR8DWoHNZrbR3bfnZLsC6HD3RWa2BrgeuMTMlgBrgBOAucADZnZsOGaoMh8B/h14eIi6XA/cX/AVioiMUmtrK3V1dSxcuBAzK3V1JoS709bWRmtrK42NjQUfV0j31DKgxd1fdPcUsA5YNSjPKuDOsL0BOMeiO70KWOfufe7+EtASyhuyTHd/KttKyeNPgB8DbxR6gSIio9Xb20t9ff0RGzAAzIz6+vpRt6YKCRoNwO6cz60hLW8ed08DnUD9MMcWUuZhzKwB+CTwgxHyrTWzZjNr3rt373BZRUSGdCQHjKyxXGM5DYT/DfBVdx8YLpO73+buTe7eNHt23mdTRERkjAoJGnuA+Tmf54W0vHnMLAlMIxoQH+rYQsocrAlYZ2a7gNXAzWZ2UQH1j7Xbf/kSl972WKmrISIxsm/fPm6++eZRH3fBBRewb9++CajRIYUEjc3AYjNrNLMqooHtjYPybAQuD9urgQc9ervTRmCNmVWbWSOwGHiiwDIP4+6N7r7Q3RcSjZt83t3/raCrjLFtr3SydU9nqashIjEyVNBIp9PDHrdp0yamT58+UdUCCpg95e7pMA32PqLpsXe4+zYzuxZodveNwO3AXWbWArQTBQFCvvXAdiANXOnuGTg4tfawMkP6F4CvAEcBW8xsk7t/blyvOkZ6Uhl6+jOlroaIxMjVV1/NCy+8wMknn0xlZSU1NTXMmDGD5557jp07d3LRRRexe/duent7+eIXv8jatWuBQ0snHThwgJUrV3LmmWfyq1/9ioaGBu69915qa2vfcd3sSH7da1NTk8d97anL73iCn+/cy/PXraQyUU5DTCJHrmeffZbjjz8egG/+ZBvbX3lrXMtfMncqf/XxE4bcv2vXLn7v936PrVu38vDDD3PhhReydevWg1Nj29vbmTlzJj09PZx22mn8/Oc/p76+/rCgsWjRIpqbmzn55JO5+OKL+cQnPsFll1027LVmmdmT7t6Ur276liqxnlTUylBrQ0SGsmzZssOepbjxxht5//vfz/Lly9m9ezfPP//8245pbGzk5JNPBuADH/gAu3btGpe6HNGr3JaD7v6oj7I3lWFqTeFPZYpIcQzXIiiWyZMnH9x++OGHeeCBB3j00UeZNGkSZ511Vt5nLaqrqw9uJxIJenp6xqUuammUWLdaGiIySF1dHfv378+7r7OzkxkzZjBp0iSee+45HnusuLMv1dIoMXVPichg9fX1fOhDH2Lp0qXU1tYyZ86cg/tWrFjBLbfcwvHHH89xxx3H8uXLi1o3BY0Sy7Y0sn+KiAD88Ic/zJteXV3NT3/607z7suMWs2bNYuvWrQfTv/zlL49bvdQ9VWL
ZFkavgoaIlAEFjRLKDDipdLQqirqnRKQcKGiUUHfq0NOdChoiUg4UNEqoJ6dLqkfdUyJSBhQ0Sih38LtXLQ0RKQMKGiWUGzTUPSUi5UBBo4R6+nPGNFLDviZERN5lpkyZAsArr7zC6tWrS1ybQxQ0SkgtDREZydy5c9mwYUOpq3GQgkYJaUxDREaya9culi5dCsDy5cvZtm3bwX1nnXUWzc3NdHV18dnPfpZly5ZxyimncO+9905YffREeAnlzpjKnX4rIjHy06vhtd+Mb5lHnQgrvzPqwy655BLWr1/PN7/5TV599VVeffVVmpqa+PM//3POPvts7rjjDvbt28eyZcs499xzD1vocLyopVFC2ZZGbWWCnn6NaYjI8C6++OKDXVXr168/ONZx//33853vfIeTTz754Kq3L7/88oTUQS2NEsq2LmZOrtJzGiJxNYYWwURpaGigvr6eLVu2cPfdd3PLLbcA4O78+Mc/5rjjjpvwOqilUULZQFE/pUpjGiJSkEsuuYTvfve7dHZ2ctJJJwFw/vnn87d/+7dk38T61FNPTdj5FTRKqLs/Q7LCmFpTqdlTIlKQ1atXs27dOi6++OKDaV/72tfo7+/npJNO4oQTTuBrX/vahJ1f3VMl1JPKUFuVoKYyQXtXqtTVEZEYOXDgAAALFy48bJnzOXPmkE4fPnGmtraWW2+9tSj1UkujhLpTaSZVJaitSqh7SkTKQkFBw8xWmNkOM2sxs6vz7K82s7vD/sfNbGHOvmtC+g4zO3+kMs3sqpDmZjYrJ/3TZrbFzH5jZr8ys/eP9aLjojuVYVJVkkmVCXVPiUhZGDFomFkCuAlYCSwBLjWzJYOyXQF0uPsi4Abg+nDsEmANcAKwArjZzBIjlPkIcC7w20HneAn4iLufCHwLuG2U1xo7PakMtZVRS0NBQyResoPKR7KxXGMhLY1lQIu7v+juKWAdsGpQnlXAnWF7A3COmVlIX+fufe7+EtASyhuyTHd/yt135bm4X7l7R/j4GDBvFNcZSz39GSaFMQ1NuRWJj5qaGtra2o7owOHutLW1UVNTM6rjChkIbwB253xuBU4fKo+7p82sE6gP6Y8NOrYhbI9U5nCuAPK+JNfM1gJrARYsWDCKIouvO5WhriZJbWWCvvQAmQEnUWGlrpbIu968efNobW1l7969pa7KhKqpqWHevNH9/7vsZk+Z2UeJgsaZ+fa7+22ErqumpqZY/zehJ5VhztRqaquiBl9vf4bJ1WX3VyJyxKmsrKSxsbHU1YilQrqn9gDzcz7PC2l585hZEpgGtA1zbCFlvo2ZnQT8PbDK3dsKqHusdfenozGNygSglW5FJP4KCRqbgcVm1mhmVUQD2xsH5dkIXB62VwMPetQZuBFYE2ZXNQKLgScKLPMwZrYAuAf4T+6+s7DLi7foOY0kNdmgoXENEYm5EftCwhjFVcB9QAK4w923mdm1QLO7bwRuB+4ysxagnSgIEPKtB7YDaeBKd89ANLV2cJkh/QvAV4CjgC1mtsndPwd8nWic5OZojJ20uzeN140ohWjKbTR7CrQ8uojEX0Ed6O6+Cdg0KO3rOdu9wKeGOPY64LpCygzpNwI35kn/HPC5QupbDtz94OwpdU+JSLnQE+El0ts/gDvU5gYNdU+JSMwpaJRIdln0SZWHuqfU0hCRuFPQKJHsC5gmVSU1piEiZUNBo0SyrYrc7qludU+JSMwpaJTIoZaGBsJFpHwoaJRIdkyjtipBTZUGwkWkPCholEhP7phGpcY0RKQ8KGiUSG73VGWigmSFqXtKRGJPQaNEsi2NbCujtjJBT2qglFUSERmRgkaJHHxOI4xn1OhFTCJSBhQ0SqS7/9CYRvSn3hMuIvGnoFEivakMZlBTGf0V1OrtfSJSBhQ0SqQ7vB88rNgbvfJVLQ0RiTkFjRLpDivcZqmlISLlQEGjRKIXMOUEDQ2Ei0gZUNAoke5UmkmVh15nUqvuKREpAwoaJdI9qKVRo+4
pESkDChol0pMaNKZRVaEptyISewoaJdI9OGioe0pEyoCCRon09GeoqcxtaSTp6c/g7iWslYjI8BQ0SqQnPKeRVVuZwB360lp/SkTiq6CgYWYrzGyHmbWY2dV59leb2d1h/+NmtjBn3zUhfYeZnT9SmWZ2VUhzM5uVk25mdmPYt8XMTh3rRcdBX3pQSyM8Ga5xDRGJsxGDhpklgJuAlcAS4FIzWzIo2xVAh7svAm4Arg/HLgHWACcAK4CbzSwxQpmPAOcCvx10jpXA4vCzFvjB6C41XlLpAaqSh25/diaVXvkqInFWSEtjGdDi7i+6ewpYB6walGcVcGfY3gCcY9H6GKuAde7e5+4vAS2hvCHLdPen3H1XnnqsAv7RI48B083s6NFcbJz0pQeozgkaNXrlq4iUgUKCRgOwO+dza0jLm8fd00AnUD/MsYWUOZZ6YGZrzazZzJr37t07QpGlkRlw0gN+eEujUq98FZH4O+IGwt39Nndvcvem2bNnl7o6eaXCYHd18vCH+yAa6xARiatCgsYeYH7O53khLW8eM0sC04C2YY4tpMyx1KMsZINGVZ7uqd5+zZ4SkfgqJGhsBhabWaOZVRENbG8clGcjcHnYXg086NEDBxuBNWF2VSPRIPYTBZY52EbgM2EW1XKg091fLaD+sZNtTeSOaWS3NXtKROIsOVIGd0+b2VXAfUACuMPdt5nZtUCzu28EbgfuMrMWoJ0oCBDyrQe2A2ngSnfPQDS1dnCZIf0LwFeAo4AtZrbJ3T8HbAIuIBpM7wb+cLxuQrH1HeyeentLQ89piEicjRg0ANx9E9GXdm7a13O2e4FPDXHsdcB1hZQZ0m8EbsyT7sCVhdQ37vrydk+ppSEi8XfEDYSXg+EGwjWmISJxpqBRAhrTEJFypaBRAhrTEJFypaBRAvmm3KqlISLlQEGjBPryjGmYGVXJCnr1cJ+IxJiCRgnka2kA1CQr6NNAuIjEmIJGCeQbCIdoXEPLiIhInClolMCQLY3KhKbcikisKWiUQL7ZU9nPGggXkThT0CiBgw/35by5D7LdU2ppiEh8KWiUQHbcoioxuHtKLQ0RiTcFjRJIpQcwg8qEHZZenUwoaIhIrClolEBfeoCqRAXRG3EPiVoa6p4SkfhS0CiBwe8Hz6rWlFsRiTkFjRLoSw9QlUy8Lb0mqSm3IhJvCholkBqypVGhloaIxJqCRgn0pTN5g4ZaGiISdwoaJRB1T+UJGmppiEjMKWiUwJDdU8kE/RknM+AlqJWIyMgUNEog6p7KMxCu94SLSMwpaJRAKj1AdWW+7im9vU9E4q2goGFmK8xsh5m1mNnVefZXm9ndYf/jZrYwZ981IX2HmZ0/Uplm1hjKaAllVoX0BWb2kJk9ZWZbzOyCd3LhpZR9uG8wtTREJO5GDBpmlgBuAlYCS4BLzWzJoGxXAB3uvgi4Abg+HLsEWAOcAKwAbjazxAhlXg/cEMrqCGUD/CWw3t1PCWXePLZLLr2hWhrZLisFDRGJq0JaGsuAFnd/0d1TwDpg1aA8q4A7w/YG4ByL1shYBaxz9z53fwloCeXlLTMcc3Yog1DmRWHbgalhexrwyuguNT5Gbmmoe0pE4qmQoNEA7M753BrS8uZx9zTQCdQPc+xQ6fXAvlDG4HN9A7jMzFqBTcCf5Kusma01s2Yza967d28Bl1d80eyptw+EVx8c01BLQ0TiqZwGwi8F/sHd5wEXAHeZ2dvq7+63uXuTuzfNnj276JUsRF86k/c5jew0XLU0RCSuCgkae4D5OZ/nhbS8ecwsSdR91DbMsUOltwHTQxmDz3UFsB7A3R8FaoBZBdQ/doZ6TiM7e6pXLQ0RialCgsZmYHGY1VRFNAi9cVCejcDlYXs18KC7e0hfE2ZXNQKLgSeGKjMc81Aog1DmvWH7ZeAcADM7nihoxLP/aQRDPhEeuqz6NBAuIjGVHCmDu6fN7CrgPiAB3OHu28zsWqDZ3TcCtxN1F7UA7URBgJB
vPbAdSANXunsGIF+Z4ZRfBdaZ2beBp0LZAH8G/J2Z/TeiQfH/HIJMWckMOOkBH/bhPj2nISJxNWLQAHD3TUSDz7lpX8/Z7gU+NcSx1wHXFVJmSH+RaHbV4PTtwIcKqW+cZd8PnndMo1JTbkUk3sppIPyIkA0a+Ve51UC4iMSbgkaRZafT5l/lVlNuRSTeFDSKrG+Yloam3IpI3CloFNnBoFH59oHwZKKCZIVpTENEYktBo8gOdk/lWUYEoi4qtTREJK4UNIrs4EB4ngULQW/vE5F4U9AosoPdU0O0NKr1nnARiTEFjSIbqaVRXVmhZUREJLYUNIos29KoSrx9IByipUS0jIiIxJWCRpEV0tLQMiIiElcKGkU24uypZEJTbkUkthQ0iqyQ2VMaCBeRuFLQKLJDYxrDPaehloaIxJOCRpGlhnkiHKKlRDSmISJxpaBRZNkxjXxrT4FaGiISbwoaRZZKD2AGyQrLu19BQ0TiTEGjyPrC+8HN8gcNdU+JSJwpaBRZX3pgyEFwiMY6+tIDlOGbbEXkXUBBo8j60gNDDoKD3hMuIvGmoFFkfenMsC2NmqTeEy4i8aWgUWSp9MCQD/bBoYf+1NIQkTgqKGiY2Qoz22FmLWZ2dZ791WZ2d9j/uJktzNl3TUjfYWbnj1SmmTWGMlpCmVU5+y42s+1mts3MfjjWiy6lkcY01NIQkTgbMWiYWQK4CVgJLAEuNbMlg7JdAXS4+yLgBuD6cOwSYA1wArACuNnMEiOUeT1wQyirI5SNmS0GrgE+5O4nAH865qsuodSIYxrZoKGWhojETyEtjWVAi7u/6O4pYB2walCeVcCdYXsDcI5Fc0pXAevcvc/dXwJaQnl5ywzHnB3KIJR5Udj+I+Amd+8AcPc3Rn+5pdeXzgz5AiY49NCfWhoiEkeFBI0GYHfO59aQljePu6eBTqB+mGOHSq8H9oUyBp/rWOBYM3vEzB4zsxX5Kmtma82s2cya9+7dW8DlFddIYxrZlobGNEQkjsppIDwJLAbOAi4F/s7Mpg/O5O63uXuTuzfNnj27yFUcWfbhvqFkp9yqpSEicVRI0NgDzM/5PC+k5c1jZklgGtA2zLFDpbcB00MZg8/VCmx09/7Q1bWTKIiUlVR6gKphg4YGwkUkvgoJGpuBxWFWUxXRwPbGQXk2ApeH7dXAgx490rwRWBNmVzUSfck/MVSZ4ZiHQhmEMu8N2/9G1MrAzGYRdVe9OMrrLbmopTH0QPjBMQ11T4lIDCVHyuDuaTO7CrgPSAB3uPs2M7sWaHb3jcDtwF1m1gK0EwUBQr71wHYgDVzp7hmAfGWGU34VWGdm3waeCmUT8p5nZtuBDPD/unvbO78FxZUaacptdkxDLQ0RiaERgwaAu28CNg1K+3rOdi/wqSGOvQ64rpAyQ/qLRLOrBqc78KXwU7b60pmCHu5TS0NE4qicBsKPCCO1NLJdV2ppiEgcKWgUWd+IU261jIiIxJeCRhFlBpz0gFOVGHogvCpRQaLC6OpLD5lHRKRUFDSK6ND7wYe+7WZGXU2S/b0KGiISPwoaRZR9P/hwYxpACBr9xaiSiMioKGgUUSEtDYCpNZW8pZaGiMSQgkYRZQe31dIQkXKloFFEfQdbGkMPhAPU1VRqTENEYklBo4h6UtGYRu0IQWOqgoaIxJSCRhG1dfUBMHNy5bD56mqSvNWj7ikRiR8FjSLq6E4BMHNy9bD5ptYkOZBKMzDgxaiWiEjBFDSKqO1ANmhUDZtvam0l7nAgpS4qEYkXBY0iau9KkawwptYMv05kXdivLioRiRsFjSLq6E4xY3IV0avQh1ZXE415aDBcROJGQaOI2g6kmDlp+K4piGZPgYKGiMSPgkYRtXelRhzPAHVPiUh8KWgUUXt3iplTCg8a+/sUNEQkXhQ0iqi9q7DuKY1piEhcKWgUSTozQGdPv7qnRKSsKWgUyb6eftyhvoDuqZrKBFXJCrU0RCR2FDSKpL0rerBvRgHdUxA
9Fa7l0UUkbgoKGma2wsx2mFmLmV2dZ3+1md0d9j9uZgtz9l0T0neY2fkjlWlmjaGMllBm1aBz/T9m5mbWNJYLLpVs0KgvoHsKsosWqntKROJlxKBhZgngJmAlsAS41MyWDMp2BdDh7ouAG4Drw7FLgDXACcAK4GYzS4xQ5vXADaGsjlB2ti51wBeBx8d2uaVzsKVRYNCoU0tDRGKokJbGMqDF3V909xSwDlg1KM8q4M6wvQE4x6LHnlcB69y9z91fAlpCeXnLDMecHcoglHlRznm+RRRUekd5nSXXNsqWRp1aGiISQ4UEjQZgd87n1pCWN4+7p4FOoH6YY4dKrwf2hTIOO5eZnQrMd/f/f7jKmtlaM2s2s+a9e/cWcHnF0THKlsbU2qQGwkUkdspiINzMKoDvAX82Ul53v83dm9y9afbs2RNfuQK1d6Woq0lSOcKrXrPqqis15VZEYqeQb7A9wPycz/NCWt48ZpYEpgFtwxw7VHobMD2UkZteBywFHjazXcByYGM5DYa3d6UK7pqC7HvC1dIQkXgpJGhsBhaHWU1VRAPbGwfl2QhcHrZXAw+6u4f0NWF2VSOwGHhiqDLDMQ+FMghl3uvune4+y90XuvtC4DHgE+7ePMbrLrpC153KmlpbSU9/hv7MwATWSkRkdEYMGmF84SrgPuBZYL27bzOza83sEyHb7UC9mbUAXwKuDsduA9YD24H/DVzp7pmhygxlfRX4UiirPpRd9tpGGTQOrj+l1oaIxMjwbwMK3H0TsGlQ2tdztnuBTw1x7HXAdYWUGdJfJJpdNVx9ziqk3nHS0ZXixIapBec/tP5UYUuPiIgUQ1kMhJc7d6e9K1XwzClQS0NE4klBowgO9KVJZQZGNRCefRHTW3pWQ0RiREGjCDq6oi/+mZOrCz7m0Eq3ammISHwoaBRBW1cfADMnVxZ8zNScMQ0RkbhQ0CiCju7oafDRtDSm1mpMQ0TiR0GjCNoOhKBR4LLoAFOqQ/eUWhoiEiMKGkWQXeG2kPeDZyUTFUyqSqilISKxoqBRBO3dKaqSFUyuSozqOL1TQ0TiRkGjCNoPpJg5qYpo5ffC1dUkNXtKRGJFQaMIRrvuVFZdTZL9fWppiEh8KGgUQXt3ivpRjGdkTautZF+3goaIxIeCRhG0d6WYMYqZU1lzptbw+lt9E1AjEZGxUdAogrF2Tx09rZY3D/TRl85MQK1EREZPQWOCpdID7O9Njy1oTK8B4PVOtTZEJB4UNCbYoafBx9LSiILGK50941onEZGxUtCYYNkH+0azwm3W0dNqAXhVQUNEYkJBY4Jlg8Zo3qWRNTd0T72yr3dc6yQiMlYKGhOs7R20NCZVJZlWW8lrnQoaIhIPChoTrKNr7GMaEI1rqHtKROJCQWOCtXWlMIPpY3hOA2Du9Fp1T4lIbChoTLCOrhTTaytJVIxu3amso9TSEJEYKShomNkKM9thZi1mdnWe/dVmdnfY/7iZLczZd01I32Fm549Uppk1hjJaQplVIf1LZrbdzLaY2c/M7Jh3cuHF0t6VGtMgeNbcaTV0dPfTk9IDfiJSeiMGDTNLADcBK4ElwKVmtmRQtiuADndfBNwAXB+OXQKsAU4AVgA3m1lihDKvB24IZXWEsgGeAprc/SRgA/DdsV1ycbV19Y1pEDxL025FJE6SBeRZBrS4+4sAZrYOWAVsz8mzCvhG2N4A/C+L1gFfBaxz9z7gJTNrCeWRr0wzexY4G/iDkOfOUO4P3P2hnPM9Blw2iussmY6ufhbOmjTm47NPhb/W2ct7Zk8ZWyGpbnjlKXh9K3S3QaYfaqfDtPlw9PthRiNUqKdSREZWSNBoAHbnfG4FTh8qj7unzawTqA/pjw06tiFs5yuzHtjn7uk8+XNdAfw0X2XNbC2wFmDBggXDXVdRtHWlOPWY6WM+fm5oabwylmm3rz4Dj9wIOzZBf/eh9IokDOS8p6OqDuY1wYIzYMHyaLtq8tDlprrhzZ3w5vPQ1wn
uMP0YmHsyTPmd0ddTRMpGIUEjVszsMqAJ+Ei+/e5+G3AbQFNTkxexam8zMOB0dI9tscKso8JSIq/uG0X31FuvwM++Bc/8CGqmwkmXwHEro1bF5N8BM0gdgLYX4LUtUStk9xPw8F8DDpaI8jacCpNnR59790FbC+x9Djp+G+UbzCqg8SPw4S/DwjPHfM0iEl+FBI09wPycz/NCWr48rWaWBKYBbSMcmy+9DZhuZsnQ2jjsXGZ2LvAXwEdCl1es7e9Nkxnww5dFH8jAU3fBM+ug/UWomgJzT4FF58DxH4fqusPKqKlMMHNyVWEtjVRX1LL41Y1RS+KDfwK/+2dRV9Rg1XVRy2DuyXDqZ6K0nn3QuhlefhRefgy2rIe+t6J9yVqY2RjV9f2Xwuz3wezjoHYm4NCxC1oegKd/CP9wYRSoLvweVI+xS01EYqmQoLEZWGxmjURf4Gs4NOaQtRG4HHgUWA086O5uZhuBH5rZ94C5wGLgCcDylRmOeSiUsS6UeS+AmZ0C3AqscPc33sE1F01bVxTXDr6AqacDfvQH8PKvYM6JcOz50N0Ov30Etm6Af/8SvO/C6Ev5PWdBIvrrGfEBv4FMFIQe/BbsfxWWXATnfiP6kh+N2umw+GPRT1YmvAQqUTn8sXVHRV1bZ34JfvE/4Zffg7074A/WQ92c0dVDRGJrxKARxiiuAu4DEsAd7r7NzK4Fmt19I3A7cFcY6G4nCgKEfOuJBs3TwJXungHIV2Y45VeBdWb2baIZU7eH9P8BTAH+Jbxr+2V3/8Q7vgMT6NAKt9XQ3wv/tDrqDvrkrdH/xLPvDHePuoe2rIOt90QBZPLvwImfgvddwPypCXble8Av1Q3P/iT6kn5zBzR8AD71D9GX93gZKVgMVjUJzvlaNC6y4bNw58fhivvzt3ZEpOyYe0m7/SdUU1OTNzc3l+z89297jbV3PclPrjqTE3/z1/D4D+Diu2DJMLEu3QfP3x+1HHbeBwP9pCpqeHrgvSxrOh2qp0ImFf0vfvfj0djE7PfBWVfD8aviNQvqpV/AXZ+EhR+CT28YfQASkZIwsyfdvSnfvrIbCC8n2RVu53Q/B0/cCqf90fABAyBZHY1tHP9x6O2EXYQj2KkAAA19SURBVL9kxy/upWr3ZnzrPVh/dzT7qX5R1BI5cTUs+GC8gkVW4+/Cx78P934eHvrvcO5flbpGIvIOKWhMoPbuFMYAsx6+GibNgrP/cnQF1EyD913IK5kP8Md3Pck9n/sgpy6YMTGVnSinfDoaw3nkb6Lxmnl5//MiImUihv89PXK0H0jxiconqXjl13Det8bcr7+0YRoA2/Z0jmf1iuf8/w51c+Ff/0vU/SYiZUtBYwK1H+hjbfLfYeZ7oq6kMZo7rYYZkyrZuuetcaxdEdVMg098H9qeh8dvKXVtROQdUNCYQLM6nuQEfx7OuBIqEmMux8xY2jCNra+UaUsDYNG5cOwK+Pn/gANlMWNaRPJQ0JhA57SvY3/FNDj50++4rKUN09j5+n760mW82u1510G6Bx78dqlrIiJjpKAxUd54jtP7N7P5d1ZDZe07Lm7p3Gn0Z5ydrx0Yh8qVyKxFcNrnoifi9+4sdW1EZAwUNCZI6pc30uNV7H7v4Ifnx+bEMBhe1l1UAL/7ZaicFD29LiJlR0FjIux/jeTWf+FfMh9h1px8i/SO3vyZtdTVJNlarjOosqbMjtbEenYjtD5Z6tqIyCgpaEyEx2/BPM3fZy6gYcY775qCMBg+d1r5Bw2IJgZMmgUP/FW0hIqIlA0FjfHWtx8238FvZ5/Nyz6HeeMUNABOnDeNZ1/bT39mYNzKLInqOvjIV2DXL+CFn5W6NiIyCgoa4+3X/wh9nTw4cw01lRXv6FWvg50wdyqp9AAtb5TxYHjWB/4zTF8AD3wTBso8CIq8iyhojKdMPzz2AzjmQzzR/x7mzZiEZVeyHQfZwfBfv9wxbmWWTLIaPvqX0aq/2+4pdW1EpEAKGuPpmXX
QuRs++AVa93XTMH38uqYAGmdN5pj6Sfzvra+Na7klc+KnYM7S6LmNdKrUtRGRAihojJd0Cv7Pd6M32x17Pns6esZ1PAOiwfCVS4/mVy+00dF1BHzJVlTAOV+Hjpfg13eWujYiUgAFjfHy9D/Bvpfho3/BgVSGju5+5s2YNO6nufDEo8kMOP+x/XUAnnipnRv+Yydl+16UxedFS7v//LvR62pFJNYUNMZDbyc8fD3MOw0WncuejujVrOPd0gBY2jCVeTNq2bT1VTq6Unz+n5/k+z97ns27ynScwww+9k3oegN+eUOpayMiI1DQGA8PfCP60lt5PZjR2tENMG7PaOQyMy488WgeaXmTr/54C/u6+6mrTnLHL18a93MVzfxlcNIa+MX3YPfmUtdGRIahoPFO7XoEmu+A0/9r9I5uYM++iWtpAKw88Wj6M87921/n82e9l8vOOIb7t7/G7vYoWKXSA+XXXXXBd2FqA9zzR1HLTURiSUHjnejcAxs+CzMa4aN/fjC5taOH6mQFs6dUT8hp3z9vGgtmTuLYOVO48uxFfOaMYzAz/vHRXWz6zaucdt0D/Le7ny6vwFEzDX7/NuhshR9eAqnuUtdIRPJQ0Birjt/CnR+PBm/X/BCqpxzc1drRTcOM2nF9RiOXmfGjtcv50R8tpzqZ4OhptaxcehT/3yO7+Pw//5rJVQn+7elX+NsHWybk/BPmmDOiwPHyY/CjNdD1ZqlrJCKDFBQ0zGyFme0wsxYzuzrP/mozuzvsf9zMFubsuyak7zCz80cq08waQxktocyqkc5RVJl+eOqf4bazoPtNuOzHMGfJYVlaO3rG/RmNwRqm11Kf05L54w+/l9rKBFd+9L38/Csf5fdPbeB7/7GT23/5Elv3dLKv+9AUXXeneVc7T7zUzsBA1Bp56uUOrv3Jdn7x/N7StlCW/j5c9AN4+VG45UzY9q8wUMbvEBE5wthIXxBmlgB2Ah8DWoHNwKXuvj0nz+eBk9z9v5jZGuCT7n6JmS0BfgQsA+YCDwDHhsPylmlm64F73H2dmd0CPOPuPxjqHMPVvampyZubm0d3R7IGMtDfHXWTHHgd2l+E3Y/D9nvhrT3R+MUnb4veEQF09vSzd38fU2uTrPibX3D+CUfx179/4tjOPUbufrB105fOcNnfP37YrKoTG6axrHEmv3h+Lztfj5YimT+zlobptTz2YvvBfKc3zuSD751FdypNX3oAM6hOJjixYRqnLZzB7LpqzIydr+/nnl/vYUvrPo6dU8eJDdOYN6OWWXXVdPdleKmti11vdvHSm1283N5NR3eKt3rSnDB3Kp8+fQEnNExjy+59vNzezdKGaZyyYDqTqpIMDDj9e54h+W9rSbTtoGfKAnbP/gg2fxkNC49l0qxjGKicRMqq6MskONCfobW9mz37ephdV83SudOYMY7Lt8TFgb40u9u72dfdD0Blwjh6ei1HTa0hUTExrdpS6e3PsHlXO4++0MacqTV85NjZLJw1GYh+t1/c28WLe7uoqaxgdl01s6ZUUz+liurk2N+QWY4yA05fOkPbgRTPvbafPR3dHDunjvfPn87k6uSYyzWzJ929Ke++AoLGGcA33P388PkaAHf/65w894U8j5pZEngNmA1cnZs3my8c9rYyge8Ae4Gj3D2de+6hzuHDXMCYg8bWe2DDH749PVkDx3wITv9jWPSx6OG04N6n9/DFdU8f/PyVFcfx+bMWjf7c46g/M8D2V97i1c5eWt7Yz8937uXJ33Zw/NFTufyMhVQmjQ1PtrLrzW4uW34Ml5w2n5888wo3PdTCG/v7qE5WUJWsAIe+9ACpsFCiGVQmKkilB0hUGMfNqWNXWxfdqfwtgrnTalhQP4n6KdVMqkzwf57fy+tv9b0tX4VBosLoz0R/pRUMcF5FM3+Q+BnLKp6jxvrfdszLA7P5cOr7b0ufVJU4uICuE20M96ueryfRsALyFIfDkPc3WWHR39NwxxfYeMzeq3da1mj
aqtl7aHbonvdnBkgPOIkKIxNaw7WVCQbc6c8MMDDECWorE3n/nsbDRDbAHc/5feXgDRycnv268xHqU2Fw1UcX8aXzjhtTfd5p0FgNrHD3z4XP/wk43d2vysmzNeRpDZ9fAE4nChCPufs/hfTbgZ+Gw95WZk7+RSF9PvBTd1861Dnc/bCObzNbC6wNH48Ddgx/eybELEAd8roPuXQvDtG9iMT5Phzj7rPz7Rh7+yWm3P024LZS1sHMmoeK0u8mug+H6F4consRKdf7UMhA+B5gfs7neSEtb57QdTQNaBvm2KHS24DpoYzB5xrqHCIiUiSFBI3NwOIwq6kKWANsHJRnI3B52F4NPBjGGjYCa8LMp0ZgMfDEUGWGYx4KZRDKvHeEc4iISJGM2D0VBqSvAu4DEsAd7r7NzK4Fmt19I3A7cJeZtQDtREGAkG89sB1IA1e6ewYgX5nhlF8F1pnZt4GnQtkMdY6YKmn3WIzoPhyie3GI7kWkLO/DiAPhIiIiWXoiXERECqagISIiBVPQGEcjLbdSrsxsl5n9xsyeNrPmkDbTzP7DzJ4Pf84I6WZmN4Z7sMXMTs0p5/KQ/3kzuzwn/QOh/JZwbGwebzazO8zsjfCcUDZtwq99qHOU0hD34htmtif8bjxtZhfk7DuylhA6VM/5ZvaQmW03s21m9sWQ/u74vXB3/YzDD9GA/gvAe4Aq4BlgSanrNU7XtguYNSjtu8DVYftq4PqwfQHRA5wGLAceD+kzgRfDnzPC9oyw74mQ18KxK0t9zTnX+WHgVGBrMa99qHPE8F58A/hynrxLwr+BaqAx/NtIDPfvBFgPrAnbtwD/NWx/HrglbK8B7i7xfTgaODVs1xEtibTk3fJ7UfJ/lEfKD3AGcF/O52uAa0pdr3G6tl28PWjsAI4O20cDO8L2rUTriB2WD7gUuDUn/daQdjTwXE76Yfni8AMsHPRFOeHXPtQ5Sv2T5158g/xB47Dff6KZkmcM9e8kfDm+CSRD+sF82WPDdjLks1Lfi5xruJdoHb13xe+FuqfGTwOwO+dza0g7Ejhwv5k9adEyLQBz3P3VsP0aMCdsD3UfhktvzZMeZ8W49qHOEUdXhW6XO3K6S0Z7L+qBfe6eHpR+WFlhf2fIX3Khq+wU4HHeJb8XChpSiDPd/VRgJXClmX04d6dH/+15V87dLsa1x/z+/gB4L3Ay8CrwP0tbneIxsynAj4E/dfe3cvcdyb8XChrjp5DlVsqSu+8Jf74B/CvRUvevm9nRAOHPN0L20S4dsydsD06Ps2Jc+1DniBV3f93dM+4+APwd0e8GHOFLCJlZJVHA+Gd3vyckvyt+LxQ0xk8hy62UHTObbGZ12W3gPGArhy/rMni5l8+EGSPLgc7QnL4POM/MZoQujPOI+qxfBd4ys+VhhshncsqKq2Jc+1DniJXsF1jwSaLfDTiClxAKf1e3A8+6+/dydr07fi9KPYh0JP0QzZLYSTQ75C9KXZ9xuqb3EM1weQbYlr0uoj7lnwHPE71ca2ZIN+CmcA9+AzTllPVZoCX8/GFOehPRl80LwP8iXoOcPyLqdukn6lu+ohjXPtQ5Yngv7grXuoXoC+3onPx/Ea5rBzkz4ob6dxJ+154I9+hfgOqQXhM+t4T97ynxfTiTqFtoC/B0+Lng3fJ7oWVERESkYOqeEhGRgiloiIhIwRQ0RESkYAoaIiJSMAUNEREpmIKGiIgUTEFDREQK9n8B8i9dzpf1VgQAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['LotArea'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['LotArea'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see from this plot, the variable distribution is dramatically different!\n", + "\n", + "This should trigger all sorts of alerts, and we should further investigate the reason for this data shift, as the performance of our model could be impaired." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['GrLivArea'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['GrLivArea'], ax=ax, label='live')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['BsmtFinSF1'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['BsmtFinSF1'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finally - compare predictions\n", + "\n", + "Finally, we compare the predictions of our model.\n", + "\n", + "Given that we observed differences in the amount of missing data in BsmtQual and differences in the distribution of the other variables, there is sufficient reason to believe that the distribution of the predictions will not hold.\n", + "\n", + "Let's test that anyhow.\n", + "\n", + "In this particular scenario, we do not have the real value of the Sale Price during shadow mode, because the houses take some time to sell.\n", + "\n", + "So in order to evaluate the performance of the model, we can compare the distributions of the predictions between the live and train data, again using the KS test." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ks_2sampResult(statistic=0.15722705507227056, pvalue=4.440892098500626e-15)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats.ks_2samp(train_data['SalePrice'], live_data['SalePrice'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the distributions are significantly different. We can visualize that below:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "sns.kdeplot(train_data['SalePrice'], ax=ax, label='train')\n", + "sns.kdeplot(live_data['SalePrice'], ax=ax, label='live')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since, in the live environment, we are getting houses with a bigger LotArea, the model naturally predicts higher Sale Prices. \n", + "\n", + "Does this mean that the model will not perform well? Unclear at this stage. Higher prices are expected for bigger houses, so in principle that should not worry us too much.\n", + "\n", + "But we should definitely investigate the reasons behind the distribution changes in the input variables.\n", + "\n", + "I hope this gave you a flavour of what we should be looking at in shadow mode, and how we should think and react when things do not go as planned." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "toc": { + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "toc_cell": false, + "toc_position": {}, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/exercise_notebooks/shadow_mode_exercise/requirements.txt b/exercise_notebooks/shadow_mode_exercise/requirements.txt new file mode 100644 index 0000000..0e55c35 --- /dev/null +++ b/exercise_notebooks/shadow_mode_exercise/requirements.txt @@ -0,0 +1,15 @@ +# ML 
requirements +numpy>=1.18.1,<1.19.0 +scikit-learn>=0.22.1,<0.23.0 +pandas>=0.25.3,<0.26.0 +feature_engine>=0.3.1,<0.4.0 +joblib>=0.14.1,<0.15.0 +matplotlib>=3.1.3,<3.2.0 +seaborn>=0.10.0,<0.11.0 +jupyter>=1.0.0,<1.1.0 + +# Persistence +sqlalchemy>=1.3.11,<1.4.0 # ORM +psycopg2>=2.8.4,<2.9.0 # DB Driver +alembic>=1.3.1,<1.4.0 # DB Migrations +sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils \ No newline at end of file From ae898dd27817c301633ed02431eec230c7024f77 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 4 Jan 2020 15:07:25 +0000 Subject: [PATCH 15/26] Monitoring with Prometheus - Basic Setup --- .../prometheus_exercise/Dockerfile | 17 ++++++++++ .../prometheus_exercise/app/__init__.py | 0 .../prometheus_exercise/app/flask_app.py | 19 +++++++++++ .../prometheus_exercise/application.py | 7 ++++ .../config/prometheus/prometheus.yml | 34 +++++++++++++++++++ .../prometheus_exercise/docker-compose.yml | 27 +++++++++++++++ .../prometheus_exercise/requirements.txt | 4 +++ 7 files changed, 108 insertions(+) create mode 100644 exercise_notebooks/prometheus_exercise/Dockerfile create mode 100644 exercise_notebooks/prometheus_exercise/app/__init__.py create mode 100644 exercise_notebooks/prometheus_exercise/app/flask_app.py create mode 100644 exercise_notebooks/prometheus_exercise/application.py create mode 100644 exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml create mode 100644 exercise_notebooks/prometheus_exercise/docker-compose.yml create mode 100644 exercise_notebooks/prometheus_exercise/requirements.txt diff --git a/exercise_notebooks/prometheus_exercise/Dockerfile b/exercise_notebooks/prometheus_exercise/Dockerfile new file mode 100644 index 0000000..4fc5705 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.7-alpine +WORKDIR /application + +COPY ./requirements.txt requirements.txt +RUN apk add --no-cache \ + gcc \ + libc-dev \ + linux-headers \ + bash; \ + pip install -r 
requirements.txt; + +COPY . /application + + +EXPOSE 5000 +VOLUME /application +CMD gunicorn --workers=1 --bind 0.0.0.0:5000 application:application diff --git a/exercise_notebooks/prometheus_exercise/app/__init__.py b/exercise_notebooks/prometheus_exercise/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py new file mode 100644 index 0000000..1bd5d9d --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py @@ -0,0 +1,19 @@ +import prometheus_client +from flask import Flask +from werkzeug.middleware.dispatcher import DispatcherMiddleware + + +def index(): + return 'home' + + +def create_app(): + main_app = Flask(__name__) + main_app.add_url_rule('/', 'index', index) + + # Add prometheus wsgi middleware to route /metrics requests + app = DispatcherMiddleware(main_app.wsgi_app, { + '/metrics': prometheus_client.make_wsgi_app() + }) + + return app diff --git a/exercise_notebooks/prometheus_exercise/application.py b/exercise_notebooks/prometheus_exercise/application.py new file mode 100644 index 0000000..e03e2a0 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/application.py @@ -0,0 +1,7 @@ +from app.flask_app import create_app + + +application = create_app() + +if __name__ == '__main__': + application.run() diff --git a/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml new file mode 100644 index 0000000..ebc951b --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml @@ -0,0 +1,34 @@ +# my global config +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + evaluation_interval: 15s # By default, evaluate rules every 15 seconds. + # scrape_timeout is set to the global default (10s). 
+ + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'my-project' + +# Scrape configurations, one job per endpoint to scrape: +# Prometheus itself, followed by the webapp. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['prometheus:9090'] + - job_name: 'webapp' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: ['webapp:5000'] diff --git a/exercise_notebooks/prometheus_exercise/docker-compose.yml b/exercise_notebooks/prometheus_exercise/docker-compose.yml new file mode 100644 index 0000000..8817079 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/docker-compose.yml @@ -0,0 +1,27 @@ +version: '3' + +volumes: + prometheus_data: {} + +services: + webapp: + build: . 
+ container_name: webapp + expose: + - 5000 + ports: + - 5000:5000 + volumes: + - ./:/application + prometheus: + image: prom/prometheus + container_name: prometheus + volumes: + - ./config/prometheus/:/etc/prometheus/ + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + expose: + - 9090 + ports: + - 9090:9090 diff --git a/exercise_notebooks/prometheus_exercise/requirements.txt b/exercise_notebooks/prometheus_exercise/requirements.txt new file mode 100644 index 0000000..0fbe48f --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/requirements.txt @@ -0,0 +1,4 @@ +Flask>=1.1.1,<1.2.0 +prometheus_client>=0.7.1,<0.8.0 +gunicorn>=20.0.4,<20.1.0 + From 9f70b84aa93619f1659e81042489f8257a1698bb Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 4 Jan 2020 16:18:46 +0000 Subject: [PATCH 16/26] Monitoring with Prometheus - Add Simple Metrics --- .../prometheus_exercise/app/flask_app.py | 14 ++++- .../app/helpers/__init__.py | 0 .../app/helpers/middleware.py | 58 +++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 exercise_notebooks/prometheus_exercise/app/helpers/__init__.py create mode 100644 exercise_notebooks/prometheus_exercise/app/helpers/middleware.py diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py index 1bd5d9d..1cf0907 100644 --- a/exercise_notebooks/prometheus_exercise/app/flask_app.py +++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py @@ -1,19 +1,27 @@ import prometheus_client from flask import Flask from werkzeug.middleware.dispatcher import DispatcherMiddleware +from app.helpers.middleware import setup_metrics def index(): return 'home' +def foo(): + return 'foo' + + def create_app(): main_app = Flask(__name__) main_app.add_url_rule('/', 'index', index) + main_app.add_url_rule('/foo', 'foo', foo) + setup_metrics(main_app) # Add prometheus wsgi middleware to route /metrics 
requests - app = DispatcherMiddleware(main_app.wsgi_app, { - '/metrics': prometheus_client.make_wsgi_app() - }) + app = DispatcherMiddleware( + app=main_app.wsgi_app, + mounts={'/metrics': prometheus_client.make_wsgi_app()} + ) return app diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py b/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py new file mode 100644 index 0000000..f547ee3 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py @@ -0,0 +1,58 @@ +from flask import request, Flask +from flask.wrappers import Response +from prometheus_client import Counter, Histogram +import time + + +# Counter and Histogram are examples of default metrics +# available from the prometheus Python client. +REQUEST_COUNT = Counter( + name='http_request_count', + documentation='App Request Count', + labelnames=['app_name', 'method', 'endpoint', 'http_status'] +) +REQUEST_LATENCY = Histogram( + name='http_request_latency_seconds', + documentation='Request latency', + labelnames=['app_name', 'endpoint'] +) + + +def start_timer() -> None: + """Get start time of a request.""" + request._prometheus_metrics_request_start_time = time.time() + + +def stop_timer(response: Response) -> Response: + """Get stop time of a request..""" + request_latency = time.time() - request._prometheus_metrics_request_start_time + REQUEST_LATENCY.labels( + app_name='webapp', + endpoint=request.path).observe(request_latency) + return response + + +def record_request_data(response: Response) -> Response: + """Capture request data. + + Uses the flask request object to extract information such as + the HTTP request method, endpoint and HTTP status. 
+ """ + REQUEST_COUNT.labels( + app_name='webapp', + method=request.method, + endpoint=request.path, + http_status=response.status_code).inc() + return response + + +def setup_metrics(app: Flask) -> None: + """Setup Prometheus metrics. + + This function uses the flask before_request + and after_request hooks to capture metrics + with each HTTP request to the application. + """ + app.before_request(start_timer) + app.after_request(record_request_data) + app.after_request(stop_timer) From 8750b1a00fa8315902b510b10df5efe310fe05a1 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 4 Jan 2020 17:35:36 +0000 Subject: [PATCH 17/26] Monitoring with Prometheus - Setup Grafana --- .../prometheus_exercise/app/flask_app.py | 6 +- .../grafana_flask_basic_dashboard.json | 224 ++++++++++++++++++ .../prometheus_exercise/docker-compose.yml | 12 + 3 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py index 1cf0907..6b1c0c5 100644 --- a/exercise_notebooks/prometheus_exercise/app/flask_app.py +++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py @@ -9,7 +9,11 @@ def index(): def foo(): - return 'foo' + foo = 1 + for x in range(100000): + foo += x + + return str(foo) def create_app(): diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json new file mode 100644 index 0000000..b5a52ca --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json @@ -0,0 +1,224 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + 
"type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_count_total{job=\"webapp\"}[1m])", + "legendFormat": "{{app_name}} {{endpoint}} {{http_status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": 
"null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_latency_seconds_sum{job=\"webapp\"}[1m]) / rate(http_request_latency_seconds_count{job=\"webapp\"}[1m])", + "legendFormat": "{{endpoint}} (seconds)", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Really Simple Flask Dashboard", + "uid": "q8vgEpLZk", + "version": 4 +} \ No newline at end of file diff --git a/exercise_notebooks/prometheus_exercise/docker-compose.yml b/exercise_notebooks/prometheus_exercise/docker-compose.yml index 8817079..0538cfb 100644 --- a/exercise_notebooks/prometheus_exercise/docker-compose.yml +++ b/exercise_notebooks/prometheus_exercise/docker-compose.yml @@ -2,6 +2,7 @@ version: '3' volumes: prometheus_data: {} + grafana_data: {} services: webapp: @@ -25,3 +26,14 @@ services: - 9090 ports: - 9090:9090 + grafana: + image: grafana/grafana + depends_on: + - prometheus + ports: + - 3000:3000 + volumes: + - 
grafana_data:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_PASSWORD=foobar + - GF_USERS_ALLOW_SIGN_UP=false From 61268655601a6eb633db9c47f144875d1eb50b0a Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 5 Jan 2020 10:31:29 +0000 Subject: [PATCH 18/26] Monitoring with Prometheus - Basic Infrastructure Metrics --- .../prometheus_exercise/app/flask_app.py | 26 +- .../grafana/basic_cadvisor_dashboard.json | 605 ++++++++++++++++++ .../config/prometheus/prometheus.yml | 8 + .../prometheus_exercise/docker-compose.yml | 12 + 4 files changed, 645 insertions(+), 6 deletions(-) create mode 100644 exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py index 6b1c0c5..9d2357d 100644 --- a/exercise_notebooks/prometheus_exercise/app/flask_app.py +++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py @@ -8,18 +8,32 @@ def index(): return 'home' -def foo(): - foo = 1 - for x in range(100000): - foo += x +def cpu(): + # For older machines, you may want to lower + # this range to prevent timeouts. + for i in range(10000): + i**i - return str(foo) + return 'cpu intensive operation complete' + + +def memory(): + d = {} + # For older machines, you may want to lower + # this range to prevent timeouts. 
+ for i in range(10000000): + i = str(i) + i += "xyz" + d[i] = i + + return 'memory intensive operation complete' def create_app(): main_app = Flask(__name__) main_app.add_url_rule('/', 'index', index) - main_app.add_url_rule('/foo', 'foo', foo) + main_app.add_url_rule('/cpu', 'cpu', cpu) + main_app.add_url_rule('/memory', 'memory', memory) setup_metrics(main_app) # Add prometheus wsgi middleware to route /metrics requests diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json new file mode 100644 index 0000000..b621f02 --- /dev/null +++ b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json @@ -0,0 +1,605 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker monitoring with Prometheus and cAdvisor with node selection", + "editable": true, + "gnetId": 8321, + "graphTooltip": 1, + "id": 6, + "iteration": 1578215128428, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "height": "20", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + 
"postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_last_seen", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Running containers", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "mbytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 0 + }, + "height": "20", + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": 
"sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total Memory Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 16, + "y": 0 + }, + "height": "20", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total CPU Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": 
"N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + 
"y": 10 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [ + "docker" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(container_cpu_user_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + 
"type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "8080", + "value": "8080" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Docker monitoring with node selection", + "uid": "pHUTSjLZk", + "version": 2 +} \ No newline at end of file diff --git a/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml index ebc951b..19d7bd8 100644 --- a/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml +++ b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml @@ -32,3 +32,11 @@ scrape_configs: # scheme defaults to 'http'. 
static_configs: - targets: ['webapp:5000'] + + - job_name: 'cadvisor' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['cadvisor:8080'] diff --git a/exercise_notebooks/prometheus_exercise/docker-compose.yml b/exercise_notebooks/prometheus_exercise/docker-compose.yml index 0538cfb..522e59b 100644 --- a/exercise_notebooks/prometheus_exercise/docker-compose.yml +++ b/exercise_notebooks/prometheus_exercise/docker-compose.yml @@ -26,6 +26,8 @@ services: - 9090 ports: - 9090:9090 + depends_on: + - cadvisor grafana: image: grafana/grafana depends_on: @@ -37,3 +39,13 @@ services: environment: - GF_SECURITY_ADMIN_PASSWORD=foobar - GF_USERS_ALLOW_SIGN_UP=false + + cadvisor: + image: google/cadvisor + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 8080:8080 From 2bd17e4ce8655a7f806dd6dcbbdeda3bcccdb516 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 5 Jan 2020 13:30:53 +0000 Subject: [PATCH 19/26] Monitoring with Prometheus - Instrument Project API --- packages/ml_api/Makefile | 4 + packages/ml_api/api/app.py | 4 + packages/ml_api/api/config.py | 2 + packages/ml_api/api/monitoring/__init__.py | 0 packages/ml_api/api/monitoring/middleware.py | 60 ++ .../basic_cadvisor_dashboard_ml_api.json | 605 ++++++++++++++++++ .../grafana_flask_basic_dashboard_ml_api.json | 229 +++++++ .../docker/config/prometheus/prometheus.yml | 42 ++ packages/ml_api/docker/docker-compose.yml | 44 +- packages/ml_api/requirements/requirements.txt | 6 + packages/ml_api/run.py | 12 +- 11 files changed, 1004 insertions(+), 4 deletions(-) create mode 100644 packages/ml_api/api/monitoring/__init__.py create mode 100644 packages/ml_api/api/monitoring/middleware.py create mode 100644 packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json create mode 100644 
packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json create mode 100644 packages/ml_api/docker/config/prometheus/prometheus.yml diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile index 67a1214..e82ee68 100644 --- a/packages/ml_api/Makefile +++ b/packages/ml_api/Makefile @@ -25,6 +25,10 @@ run-service-development: @echo "+ $@" python run.py +run-service-wsgi: + @echo "+ $@" + gunicorn --workers=1 --bind 0.0.0.0:5000 run:application + db-migrations: @echo "+ $@" PYTHONPATH=. alembic -c alembic.ini upgrade head diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py index ccb8e48..fd625f2 100644 --- a/packages/ml_api/api/app.py +++ b/packages/ml_api/api/app.py @@ -4,6 +4,7 @@ from sqlalchemy.orm import scoped_session from api.config import Config +from api.monitoring.middleware import setup_metrics from api.persistence.core import init_database _logger = logging.getLogger(__name__) @@ -23,6 +24,9 @@ def create_app( # Setup database init_database(flask_app, config=config_object, db_session=db_session) + # Setup prometheus monitoring + setup_metrics(flask_app) + connexion_app.add_api("api.yaml") _logger.info("Application instance created") diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py index 03b74be..e593629 100644 --- a/packages/ml_api/api/config.py +++ b/packages/ml_api/api/config.py @@ -14,6 +14,8 @@ # Project Directories ROOT = pathlib.Path(api.__file__).resolve().parent.parent +APP_NAME = 'ml_api' + class Config: DEBUG = False diff --git a/packages/ml_api/api/monitoring/__init__.py b/packages/ml_api/api/monitoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/ml_api/api/monitoring/middleware.py b/packages/ml_api/api/monitoring/middleware.py new file mode 100644 index 0000000..31712ef --- /dev/null +++ b/packages/ml_api/api/monitoring/middleware.py @@ -0,0 +1,60 @@ +from flask import request, Flask +from flask.wrappers import Response +from 
prometheus_client import Counter, Histogram +import time + +from api.config import APP_NAME + + +# Counter and Histogram are examples of default metrics +# available from the prometheus Python client. +REQUEST_COUNT = Counter( + name='http_request_count', + documentation='App Request Count', + labelnames=['app_name', 'method', 'endpoint', 'http_status'] +) +REQUEST_LATENCY = Histogram( + name='http_request_latency_seconds', + documentation='Request latency', + labelnames=['app_name', 'endpoint'] +) + + +def start_timer() -> None: + """Record the start time of a request.""" + request._prometheus_metrics_request_start_time = time.time() + + +def stop_timer(response: Response) -> Response: + """Observe the latency of a request.""" + request_latency = time.time() - request._prometheus_metrics_request_start_time + REQUEST_LATENCY.labels( + app_name=APP_NAME, + endpoint=request.path).observe(request_latency) + return response + + +def record_request_data(response: Response) -> Response: + """Capture request data. + + Uses the flask request object to extract information such as + the HTTP request method, endpoint and HTTP status. + """ + REQUEST_COUNT.labels( + app_name=APP_NAME, + method=request.method, + endpoint=request.path, + http_status=response.status_code).inc() + return response + + +def setup_metrics(app: Flask) -> None: + """Setup Prometheus metrics. + + This function uses the flask before_request + and after_request hooks to capture metrics + with each HTTP request to the application.
+ """ + app.before_request(start_timer) + app.after_request(record_request_data) + app.after_request(stop_timer) diff --git a/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json new file mode 100644 index 0000000..58b24a1 --- /dev/null +++ b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json @@ -0,0 +1,605 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Docker monitoring with Prometheus and cAdvisor with node selection", + "editable": true, + "gnetId": 8321, + "graphTooltip": 1, + "id": 1, + "iteration": 1578230538273, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "height": "20", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + 
"expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_last_seen", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Running containers", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "mbytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 0 + }, + "height": "20", + "id": 5, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total Memory Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], 
+ "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 16, + "y": 0 + }, + "height": "20", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)", + "intervalFactor": 2, + "legendFormat": "", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Total CPU Usage", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + 
"avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [ + "docker" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(container_cpu_user_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "cadvisor", + "value": "cadvisor" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "/([^:]+):.*/", + "skipUrlSync": 
false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "8080", + "value": "8080" + }, + "datasource": "Prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Port", + "multi": false, + "name": "port", + "options": [], + "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)", + "refresh": 1, + "regex": "/[^:]+:(.*)/", + "skipUrlSync": false, + "sort": 3, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Docker monitoring with node selection", + "uid": "pHUTSjLZk", + "version": 2 +} \ No newline at end of file diff --git a/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json new file mode 100644 index 0000000..1c92731 --- /dev/null +++ b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json @@ -0,0 +1,229 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + 
"avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(http_request_count_total{job=\"ml_api\"}[1m])", + "legendFormat": "{{app_name}} {{method}} {{endpoint}} {{http_status}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m])) / sum (rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m]))", 
+ "legendFormat": "Average (seconds)", + "refId": "A" + }, + { + "expr": "rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m]) / rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m])", + "legendFormat": "{{endpoint}} (seconds)", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Really Simple Flask Dashboard", + "uid": "q8vgEpLZk", + "version": 6 +} \ No newline at end of file diff --git a/packages/ml_api/docker/config/prometheus/prometheus.yml b/packages/ml_api/docker/config/prometheus/prometheus.yml new file mode 100644 index 0000000..1e9fa32 --- /dev/null +++ b/packages/ml_api/docker/config/prometheus/prometheus.yml @@ -0,0 +1,42 @@ +# my global config +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + evaluation_interval: 15s # By default, evaluate rules every 15 seconds. + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager).
+ external_labels: + monitor: 'my-project' + +# Scrape configurations for three targets: Prometheus itself, +# the ml_api service, and cAdvisor. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['prometheus:9090'] + - job_name: 'ml_api' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: ['ml_api:5000'] + + - job_name: 'cadvisor' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + static_configs: + - targets: ['cadvisor:8080'] diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml index 431588c..92cc691 100644 --- a/packages/ml_api/docker/docker-compose.yml +++ b/packages/ml_api/docker/docker-compose.yml @@ -12,9 +12,10 @@ services: DB_NAME: ml_api_dev depends_on: - database + - cadvisor ports: - "5000:5000" # expose webserver to localhost host:container - command: bash -c "make db-migrations && make run-service-development" + command: bash -c "make db-migrations && make run-service-wsgi" database: image: postgres:latest @@ -28,5 +29,44 @@ services: volumes: - my_dbdata:/var/lib/postgresql/data + prometheus: + image: prom/prometheus + container_name: prometheus + volumes: + - ./config/prometheus/:/etc/prometheus/ + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + expose: + - 9090 + ports: + - 9090:9090 + depends_on: + - cadvisor + + grafana: + image: grafana/grafana + depends_on: + - prometheus + ports: + - 3000:3000 + volumes: + - grafana_data:/var/lib/grafana + environment:
+ - GF_SECURITY_ADMIN_PASSWORD=foobar + - GF_USERS_ALLOW_SIGN_UP=false + + cadvisor: + image: google/cadvisor + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 8080:8080 + volumes: - my_dbdata: + my_dbdata: {} + prometheus_data: {} + grafana_data: {} diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt index ee8837f..4ef804c 100644 --- a/packages/ml_api/requirements/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -18,3 +18,9 @@ sqlalchemy>=1.3.11,<1.4.0 # ORM psycopg2>=2.8.4,<2.9.0 # DB Driver alembic>=1.3.1,<1.4.0 # DB Migrations sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils + +# Monitoring +prometheus_client>=0.7.1,<0.8.0 + +# Deployment +gunicorn>=20.0.4,<20.1.0 diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py index 898a50c..3e90817 100644 --- a/packages/ml_api/run.py +++ b/packages/ml_api/run.py @@ -1,3 +1,6 @@ +import prometheus_client +from werkzeug.middleware.dispatcher import DispatcherMiddleware + from api.app import create_app from api.config import DevelopmentConfig, setup_app_logging @@ -6,8 +9,13 @@ # setup logging as early as possible setup_app_logging(config=_config) -application = create_app(config_object=_config).app + +main_app = create_app(config_object=_config).app +application = DispatcherMiddleware( + app=main_app.wsgi_app, + mounts={'/metrics': prometheus_client.make_wsgi_app()} + ) if __name__ == "__main__": - application.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST) + main_app.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST) From e794349fbdb2f0dd96b7d79036702ef4e4befbab Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 12 Jan 2020 10:44:29 +0000 Subject: [PATCH 20/26] Monitoring with Prometheus - Build Grafana Dashboards for Model --- packages/ml_api/api/controller.py | 45 +- .../grafana_flask_basic_dashboard_ml_api.json | 15 +- 
.../config/grafana/ml_api_dashboard.json | 625 ++++++++++++++++++ packages/ml_api/scripts/populate_database.py | 39 +- packages/ml_api/tox.ini | 2 +- 5 files changed, 703 insertions(+), 23 deletions(-) create mode 100644 packages/ml_api/docker/config/grafana/ml_api_dashboard.json diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 8451e9a..1cee9e3 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -4,12 +4,44 @@ from flask import request, jsonify, Response, current_app +from gradient_boosting_model import __version__ as shadow_version +from regression_model import __version__ as live_version +from prometheus_client import Histogram, Gauge, Info from gradient_boosting_model.predict import make_prediction from api.persistence.data_access import PredictionPersistence, ModelType +from api.config import APP_NAME _logger = logging.getLogger(__name__) +PREDICTION_TRACKER = Histogram( + name='house_price_prediction_dollars', + documentation='ML Model Prediction on House Price', + labelnames=['app_name', 'model_name', 'model_version'] +) + +PREDICTION_GAUGE = Gauge( + name='house_price_gauge_dollars', + documentation='ML Model Prediction on House Price for min max calcs', + labelnames=['app_name', 'model_name', 'model_version'] +) + +PREDICTION_GAUGE.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version) + +MODEL_VERSIONS = Info( + 'model_version_details', + 'Capture model version information', +) + +MODEL_VERSIONS.info({ + 'live_model': ModelType.LASSO.name, + 'live_version': live_version, + 'shadow_model': ModelType.GRADIENT_BOOSTING.name, + 'shadow_version': shadow_version}) + def health(): if request.method == "GET": @@ -47,7 +79,18 @@ def predict(): _logger.warning(f"errors during prediction: {result.errors}") return Response(json.dumps(result.errors), status=400) - # Step 4: Prepare prediction response + # Step 4: Monitoring + for _prediction in 
result.predictions: + PREDICTION_TRACKER.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version).observe(_prediction) + PREDICTION_GAUGE.labels( + app_name=APP_NAME, + model_name=ModelType.LASSO.name, + model_version=live_version).set(_prediction) + + # Step 5: Prepare prediction response return jsonify( { "predictions": result.predictions, diff --git a/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json index 1c92731..39224a7 100644 --- a/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json +++ b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json @@ -15,7 +15,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 2, + "id": 3, "links": [], "panels": [ { @@ -59,7 +59,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(http_request_count_total{job=\"ml_api\"}[1m])", + "expr": "rate(http_request_count_total{job=\"ml_api\"}[5m])", "legendFormat": "{{app_name}} {{method}} {{endpoint}} {{http_status}}", "refId": "A" } @@ -149,11 +149,6 @@ "expr": "sum (rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m])) / sum (rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m]))", "legendFormat": "Average (seconds)", "refId": "A" - }, - { - "expr": "rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m]) / rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m])", - "legendFormat": "{{endpoint}} (seconds)", - "refId": "B" } ], "thresholds": [], @@ -205,7 +200,7 @@ "list": [] }, "time": { - "from": "now-5m", + "from": "now-1h", "to": "now" }, "timepicker": { @@ -224,6 +219,6 @@ }, "timezone": "", "title": "Really Simple Flask Dashboard", - "uid": "q8vgEpLZk", - "version": 6 + "uid": "q8vgEpLZl", + "version": 3 } \ No newline at end of file diff --git a/packages/ml_api/docker/config/grafana/ml_api_dashboard.json 
b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json new file mode 100644 index 0000000..b55b2ce --- /dev/null +++ b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json @@ -0,0 +1,625 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.6.1" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Machine learning-specific metrics", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "columns": [], + "datasource": "${DS_PROMETHEUS}", + "fontSize": "100%", + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "links": [], + "options": {}, + "pageSize": 1, + "pluginVersion": "6.5.2", + "showHeader": true, + "sort": { + "col": null, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "count 
by(live_model, live_version, shadow_model, shadow_version, version)(model_version_details_info\n* on (instance, job) group_left(version)\npython_info)", + "format": "table", + "legendFormat": "{{model_version}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Model Versions", + "transform": "table", + "transparent": true, + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate(house_price_prediction_dollars_sum{job=\"ml_api\"}[1m])) / sum (rate(house_price_prediction_dollars_count{job=\"ml_api\"}[1m]))", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average House Price Prediction Amount (USD)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(house_price_prediction_dollars_count{job=\"ml_api\"}[1m])", + "legendFormat": "Average Prediction Rate", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average House Price Prediction Rate (/second)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + 
"stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg_over_time(house_price_gauge_dollars[1m])", + "legendFormat": "AVG", + "refId": "A" + }, + { + "expr": "stddev_over_time(house_price_gauge_dollars[1m])", + "legendFormat": "STD", + "refId": "B" + }, + { + "expr": "stddev_over_time(house_price_gauge_dollars[1m]) / (sqrt(count_over_time(house_price_prediction_dollars_count[1m])))", + "legendFormat": "SEM", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Standard Error of the Mean (SEM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 9 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(avg_over_time(house_price_gauge_dollars[1m]) - avg_over_time(house_price_gauge_dollars[1w])) / (stddev_over_time(house_price_gauge_dollars[1w]))", + "legendFormat": "Z-Score", + "refId": "A" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Standard Score (Z-Score)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 5, + "options": { + "fieldOptions": { + "calcs": [ + "logmin" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.1", + "targets": [ + { + "expr": "house_price_gauge_dollars", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Min Prediction", + "type": "gauge" + }, + { + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 8, + "options": { + "fieldOptions": { + "calcs": [ + "max" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + 
"showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.6.1", + "targets": [ + { + "expr": "house_price_gauge_dollars", + "legendFormat": "Average Prediction Amount ($)", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Max Prediction", + "type": "gauge" + } + ], + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ML API Dashboard", + "uid": "q8vgEpLZk", + "version": 2 +} \ No newline at end of file diff --git a/packages/ml_api/scripts/populate_database.py b/packages/ml_api/scripts/populate_database.py index 717c4e2..fa12d88 100644 --- a/packages/ml_api/scripts/populate_database.py +++ b/packages/ml_api/scripts/populate_database.py @@ -1,15 +1,13 @@ -from gradient_boosting_model.processing.data_management import load_dataset -from gradient_boosting_model.config.core import config -import requests -import pandas as pd - -from random import randint -from itertools import islice -import json +import argparse import os -import typing as t import time +import typing as t +from random import randint +import pandas as pd +import requests +from gradient_boosting_model.config.core import config +from gradient_boosting_model.processing.data_management import load_dataset LOCAL_URL = f'http://{os.getenv("DB_HOST", "localhost")}:5000' @@ -48,7 +46,7 @@ def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: return clean_inputs_df -def populate_database(n_predictions: int = 500) -> None: +def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None: """ Manipulate the test data to generate random predictions and save them to the database. 
@@ -68,8 +66,18 @@ def populate_database(n_predictions: int = 500) -> None: "extend the script to handle more predictions." ) + if anomaly: + # set extremely low values to generate an outlier + n_predictions = 1 + clean_inputs_df.loc[:, "FirstFlrSF"] = 1 + clean_inputs_df.loc[:, "LotArea"] = 1 + clean_inputs_df.loc[:, "OverallQual"] = 1 + clean_inputs_df.loc[:, "GrLivArea"] = 1 + for index, data in clean_inputs_df.iterrows(): if index > n_predictions: + if anomaly: + print('Created 1 anomaly') break response = requests.post( @@ -89,4 +97,13 @@ def populate_database(n_predictions: int = 500) -> None: if __name__ == "__main__": - populate_database(n_predictions=500) + anomaly = False + parser = argparse.ArgumentParser( + description='Send random requests to House Price API.') + parser.add_argument('--anomaly', help="generate unusual inputs") + args = parser.parse_args() + if args.anomaly: + print("Generating unusual inputs") + anomaly = True + + populate_database(n_predictions=500, anomaly=anomaly) diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 3b499c7..09a16a0 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -100,7 +100,7 @@ setenv = PYTHONPATH=. 
DB_HOST={env:DB_HOST:localhost} -commands = python scripts/populate_database.py +commands = python scripts/populate_database.py {posargs} [testenv:typechecks] From 621b892fb029629bb554653914a11cfdbf4a81da Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 12 Jan 2020 10:51:11 +0000 Subject: [PATCH 21/26] Monitoring Logs with the Elastic Stack - Basic ELK Setup --- exercise_notebooks/elk_exercise/Dockerfile | 23 +++++ .../elk_exercise/app/__init__.py | 0 .../elk_exercise/app/flask_app.py | 18 ++++ .../elk_exercise/application.py | 7 ++ .../elk_exercise/docker-compose.yml | 91 +++++++++++++++++++ .../elasticsearch/config/elasticsearch.yml | 11 +++ .../elk_exercise/gunicorn_logging.conf | 46 ++++++++++ .../elk_exercise/kibana/config/kibana.yml | 13 +++ .../elk_exercise/logstash/config/logstash.yml | 12 +++ .../logstash/pipeline/logstash.conf | 17 ++++ .../elk_exercise/requirements.txt | 5 + 11 files changed, 243 insertions(+) create mode 100644 exercise_notebooks/elk_exercise/Dockerfile create mode 100644 exercise_notebooks/elk_exercise/app/__init__.py create mode 100644 exercise_notebooks/elk_exercise/app/flask_app.py create mode 100644 exercise_notebooks/elk_exercise/application.py create mode 100644 exercise_notebooks/elk_exercise/docker-compose.yml create mode 100644 exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml create mode 100644 exercise_notebooks/elk_exercise/gunicorn_logging.conf create mode 100644 exercise_notebooks/elk_exercise/kibana/config/kibana.yml create mode 100644 exercise_notebooks/elk_exercise/logstash/config/logstash.yml create mode 100644 exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf create mode 100644 exercise_notebooks/elk_exercise/requirements.txt diff --git a/exercise_notebooks/elk_exercise/Dockerfile b/exercise_notebooks/elk_exercise/Dockerfile new file mode 100644 index 0000000..86647f9 --- /dev/null +++ b/exercise_notebooks/elk_exercise/Dockerfile @@ -0,0 +1,23 @@ +FROM 
python:3.7-alpine +WORKDIR /application + +COPY ./requirements.txt requirements.txt +RUN apk add --no-cache \ + gcc \ + libc-dev \ + linux-headers \ + bash; \ + pip install -r requirements.txt; + +COPY . /application + + +EXPOSE 5000 +VOLUME /application +CMD gunicorn --bind 0.0.0.0:5000 \ + --workers=1 \ + --log-config gunicorn_logging.conf \ + --log-level=DEBUG \ + --access-logfile=- \ + --error-logfile=- \ + application:application diff --git a/exercise_notebooks/elk_exercise/app/__init__.py b/exercise_notebooks/elk_exercise/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/exercise_notebooks/elk_exercise/app/flask_app.py b/exercise_notebooks/elk_exercise/app/flask_app.py new file mode 100644 index 0000000..e688b45 --- /dev/null +++ b/exercise_notebooks/elk_exercise/app/flask_app.py @@ -0,0 +1,18 @@ +import logging + +from flask import Flask, current_app + + +def index(): + current_app.logger.info('home') + return 'home' + + +def create_app(): + main_app = Flask(__name__) + main_app.add_url_rule('/', 'index', index) + gunicorn_error_logger = logging.getLogger('gunicorn.error') + main_app.logger.addHandler(gunicorn_error_logger) + main_app.logger.setLevel(logging.DEBUG) + + return main_app diff --git a/exercise_notebooks/elk_exercise/application.py b/exercise_notebooks/elk_exercise/application.py new file mode 100644 index 0000000..e03e2a0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/application.py @@ -0,0 +1,7 @@ +from app.flask_app import create_app + + +application = create_app() + +if __name__ == '__main__': + application.run() diff --git a/exercise_notebooks/elk_exercise/docker-compose.yml b/exercise_notebooks/elk_exercise/docker-compose.yml new file mode 100644 index 0000000..7c73164 --- /dev/null +++ b/exercise_notebooks/elk_exercise/docker-compose.yml @@ -0,0 +1,91 @@ +version: '3.2' + +services: + # The environment variable "ELK_VERSION" is used throughout this file to + # specify the version of the images to run. 
The default is set in the + # '.env' file in this folder. It can be overridden with any normal + # technique for setting environment variables, for example: + # + # ELK_VERSION=7.0.0-beta1 docker-compose up + # + # REF: https://docs.docker.com/compose/compose-file/#variable-substitution + webapp: + build: . + container_name: webapp + expose: + - 5000 + ports: + - 5000:5000 + links: + - logstash + networks: + - elk + depends_on: + - logstash + - kibana + - elasticsearch + volumes: + - ./:/application + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} + volumes: + - type: bind + source: ./elasticsearch/config/elasticsearch.yml + target: /usr/share/elasticsearch/config/elasticsearch.yml + read_only: true + - type: volume + source: elasticsearch + target: /usr/share/elasticsearch/data + ports: + - "9200:9200" + - "9300:9300" + environment: + ES_JAVA_OPTS: "-Xmx256m -Xms256m" + ELASTIC_PASSWORD: changeme + # Use single node discovery in order to disable production mode and avoid bootstrap checks + # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html + discovery.type: single-node + networks: + - elk + + logstash: + image: docker.elastic.co/logstash/logstash:${ELK_VERSION} + volumes: + - type: bind + source: ./logstash/config/logstash.yml + target: /usr/share/logstash/config/logstash.yml + read_only: true + - type: bind + source: ./logstash/pipeline + target: /usr/share/logstash/pipeline + read_only: true + ports: + - "5001:5001" + - "9600:9600" + environment: + LS_JAVA_OPTS: "-Xmx256m -Xms256m" + networks: + - elk + depends_on: + - elasticsearch + + kibana: + image: docker.elastic.co/kibana/kibana:${ELK_VERSION} + volumes: + - type: bind + source: ./kibana/config/kibana.yml + target: /usr/share/kibana/config/kibana.yml + read_only: true + ports: + - "5601:5601" + networks: + - elk + depends_on: + - elasticsearch + +networks: + elk: + driver: bridge + +volumes: + elasticsearch: \ No newline at end 
of file diff --git a/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml new file mode 100644 index 0000000..cbed5c3 --- /dev/null +++ b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml @@ -0,0 +1,11 @@ +--- +## Default Elasticsearch configuration from Elasticsearch base image. +## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml +cluster.name: "docker-cluster" +network.host: 0.0.0.0 + +## X-Pack settings +## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html +xpack.license.self_generated.type: basic +xpack.security.enabled: true +xpack.monitoring.collection.enabled: true diff --git a/exercise_notebooks/elk_exercise/gunicorn_logging.conf b/exercise_notebooks/elk_exercise/gunicorn_logging.conf new file mode 100644 index 0000000..7ec8e8c --- /dev/null +++ b/exercise_notebooks/elk_exercise/gunicorn_logging.conf @@ -0,0 +1,46 @@ +[loggers] +keys=root, logstash.error, logstash.access + +[handlers] +keys=console, logstash + +[formatters] +keys=generic, access, json + +[logger_root] +level=INFO +handlers=console + +[logger_logstash.error] +level=INFO +handlers=logstash +propagate=1 +qualname=gunicorn.error + +[logger_logstash.access] +level=INFO +handlers=logstash +propagate=0 +qualname=gunicorn.access + +[handler_console] +class=StreamHandler +formatter=generic +args=(sys.stdout, ) + +[handler_logstash] +class=logstash.TCPLogstashHandler +formatter=json +args=('logstash', 5001) + +[formatter_generic] +format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s +datefmt=%Y-%m-%d %H:%M:%S +class=logging.Formatter + +[formatter_access] +format=%(message)s +class=logging.Formatter + +[formatter_json] +class=pythonjsonlogger.jsonlogger.JsonFormatter \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/kibana/config/kibana.yml 
b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml new file mode 100644 index 0000000..93380e9 --- /dev/null +++ b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml @@ -0,0 +1,13 @@ +--- +## Default Kibana configuration from Kibana base image. +## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js +# +server.name: kibana +server.host: "0" +elasticsearch.hosts: [ "http://elasticsearch:9200" ] +xpack.monitoring.ui.container.elasticsearch.enabled: true + +## X-Pack security credentials +# +elasticsearch.username: elastic +elasticsearch.password: changeme \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/logstash/config/logstash.yml b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml new file mode 100644 index 0000000..a48c35f --- /dev/null +++ b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml @@ -0,0 +1,12 @@ +--- +## Default Logstash configuration from Logstash base image. 
+## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml +# +http.host: "0.0.0.0" +xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] + +## X-Pack security credentials +# +xpack.monitoring.enabled: true +xpack.monitoring.elasticsearch.username: elastic +xpack.monitoring.elasticsearch.password: changeme diff --git a/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf new file mode 100644 index 0000000..7c273f0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf @@ -0,0 +1,17 @@ +input { + tcp { + port => 5001 + tags => ["webapp_logs"] + type => "webapp_logs" + codec => json + } +} + +output { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "webapp_logs-%{+YYYY.MM.dd}" + } +} \ No newline at end of file diff --git a/exercise_notebooks/elk_exercise/requirements.txt b/exercise_notebooks/elk_exercise/requirements.txt new file mode 100644 index 0000000..6607dd0 --- /dev/null +++ b/exercise_notebooks/elk_exercise/requirements.txt @@ -0,0 +1,5 @@ +Flask>=1.1.1,<1.2.0 +python3-logstash>=0.4.80,<0.5.0 +python-json-logger>=0.1.11,<0.2.0 +gunicorn>=20.0.4,<20.1.0 + From 3a9bbf4d542e34bc8ff7a5b12687b56ca39d34f5 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 8 Feb 2020 12:49:56 +0000 Subject: [PATCH 22/26] Monitoring Logs with the Elastic Stack - Integrate with API --- packages/ml_api/Makefile | 8 +- packages/ml_api/api/app.py | 2 +- packages/ml_api/api/config.py | 9 +- packages/ml_api/api/controller.py | 21 ++-- packages/ml_api/api/persistence/core.py | 2 +- .../ml_api/api/persistence/data_access.py | 7 +- packages/ml_api/docker/docker-compose-elk.yml | 99 +++++++++++++++++++ .../elasticsearch/config/elasticsearch.yml | 10 ++ .../ml_api/docker/kibana/config/kibana.yml | 12 +++ .../docker/logstash/config/logstash.yml | 11 +++ 
.../docker/logstash/pipeline/logstash.conf | 17 ++++ packages/ml_api/gunicorn_logging.conf | 49 +++++++++ packages/ml_api/requirements/requirements.txt | 6 +- packages/ml_api/run.py | 2 - 14 files changed, 235 insertions(+), 20 deletions(-) create mode 100644 packages/ml_api/docker/docker-compose-elk.yml create mode 100644 packages/ml_api/docker/elasticsearch/config/elasticsearch.yml create mode 100644 packages/ml_api/docker/kibana/config/kibana.yml create mode 100644 packages/ml_api/docker/logstash/config/logstash.yml create mode 100644 packages/ml_api/docker/logstash/pipeline/logstash.conf create mode 100644 packages/ml_api/gunicorn_logging.conf diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile index e82ee68..a544913 100644 --- a/packages/ml_api/Makefile +++ b/packages/ml_api/Makefile @@ -27,7 +27,13 @@ run-service-development: run-service-wsgi: @echo "+ $@" - gunicorn --workers=1 --bind 0.0.0.0:5000 run:application + gunicorn --bind 0.0.0.0:5000 \ + --workers=1 \ + --log-config gunicorn_logging.conf \ + --log-level=DEBUG \ + --access-logfile=- \ + --error-logfile=- \ + run:application db-migrations: @echo "+ $@" diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py index fd625f2..f8f74e0 100644 --- a/packages/ml_api/api/app.py +++ b/packages/ml_api/api/app.py @@ -7,7 +7,7 @@ from api.monitoring.middleware import setup_metrics from api.persistence.core import init_database -_logger = logging.getLogger(__name__) +_logger = logging.getLogger('mlapi') def create_app( diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py index e593629..1f528c4 100644 --- a/packages/ml_api/api/config.py +++ b/packages/ml_api/api/config.py @@ -2,10 +2,10 @@ import os import pathlib import sys +from logging.config import fileConfig import api - # logging format FORMATTER = logging.Formatter( "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" @@ -82,10 +82,9 @@ def get_console_handler(): def 
setup_app_logging(config: Config) -> None: """Prepare custom logging for our application.""" _disable_irrelevant_loggers() - root = logging.getLogger() - root.setLevel(config.LOGGING_LEVEL) - root.addHandler(get_console_handler()) - root.propagate = False + fileConfig(ROOT / 'gunicorn_logging.conf') + logger = logging.getLogger('mlapi') + logger.setLevel(config.LOGGING_LEVEL) def _disable_irrelevant_loggers() -> None: diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 1cee9e3..7e986e6 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -3,16 +3,16 @@ import threading from flask import request, jsonify, Response, current_app +from prometheus_client import Histogram, Gauge, Info +from regression_model import __version__ as live_version +from api.config import APP_NAME +from api.persistence.data_access import PredictionPersistence, ModelType from gradient_boosting_model import __version__ as shadow_version -from regression_model import __version__ as live_version -from prometheus_client import Histogram, Gauge, Info from gradient_boosting_model.predict import make_prediction -from api.persistence.data_access import PredictionPersistence, ModelType -from api.config import APP_NAME +_logger = logging.getLogger('mlapi') -_logger = logging.getLogger(__name__) PREDICTION_TRACKER = Histogram( name='house_price_prediction_dollars', @@ -45,13 +45,18 @@ def health(): if request.method == "GET": - return jsonify({"status": "ok"}) + status = {"status": "ok"} + _logger.debug(status) + return jsonify(status) def predict(): if request.method == "POST": # Step 1: Extract POST data from request body as JSON json_data = request.get_json() + _logger.info( + f'Inputs for model: {ModelType.LASSO.name} ' + f'Input values: {json_data}') # Step 2a: Get and save live model predictions persistence = PredictionPersistence(db_session=current_app.db_session) @@ -89,6 +94,10 @@ def predict(): app_name=APP_NAME, 
model_name=ModelType.LASSO.name, model_version=live_version).set(_prediction) + _logger.info( + f'Prediction results for model: {ModelType.LASSO.name} ' + f'version: {result.model_version} ' + f'Output values: {result.predictions}') # Step 5: Prepare prediction response return jsonify( diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py index 7fd231a..aab5555 100644 --- a/packages/ml_api/api/persistence/core.py +++ b/packages/ml_api/api/persistence/core.py @@ -11,7 +11,7 @@ from api.config import Config, ROOT -_logger = logging.getLogger(__name__) +_logger = logging.getLogger('mlapi') # Base class for SQLAlchemy models Base = declarative_base() diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py index 4feb247..f46d24f 100644 --- a/packages/ml_api/api/persistence/data_access.py +++ b/packages/ml_api/api/persistence/data_access.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -from gradient_boosting_model.predict import make_prediction as make_shadow_prediction from regression_model.predict import make_prediction as make_live_prediction from sqlalchemy.orm.session import Session @@ -13,6 +12,10 @@ LassoModelPredictions, GradientBoostingModelPredictions, ) +from gradient_boosting_model.predict import make_prediction as make_shadow_prediction + +_logger = logging.getLogger('mlapi') + SECONDARY_VARIABLES_TO_RENAME = { "FirstFlrSF": "1stFlrSF", @@ -20,8 +23,6 @@ "ThreeSsnPortch": "3SsnPorch", } -_logger = logging.getLogger(__name__) - class ModelType(enum.Enum): LASSO = "lasso" diff --git a/packages/ml_api/docker/docker-compose-elk.yml b/packages/ml_api/docker/docker-compose-elk.yml new file mode 100644 index 0000000..3ed1806 --- /dev/null +++ b/packages/ml_api/docker/docker-compose-elk.yml @@ -0,0 +1,99 @@ +version: '3.2' +services: + ml_api: + build: + context: ../ + dockerfile: docker/Dockerfile + environment: + DB_HOST: database + DB_PORT: 5432 + DB_USER: user 
+ DB_PASSWORD: ${DB_PASSWORD:-password} + DB_NAME: ml_api_dev + networks: + - elk + depends_on: + - database + - logstash + ports: + - "5000:5000" # expose webserver to localhost host:container + command: bash -c "make db-migrations && make run-service-wsgi" + + database: + image: postgres:latest + environment: + POSTGRES_USER: user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_dev + ports: + # expose postgres container on different host port to default (host:container) + - "6609:5432" + volumes: + - my_dbdata:/var/lib/postgresql/data + networks: + - elk + + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} + volumes: + - type: bind + source: ./elasticsearch/config/elasticsearch.yml + target: /usr/share/elasticsearch/config/elasticsearch.yml + read_only: true + - type: volume + source: elasticsearch + target: /usr/share/elasticsearch/data + ports: + - "9200:9200" + - "9300:9300" + environment: + ES_JAVA_OPTS: "-Xmx256m -Xms256m" + ELASTIC_PASSWORD: changeme + # Use single node discovery in order to disable production mode and avoid bootstrap checks + # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html + discovery.type: single-node + networks: + - elk + + logstash: + image: docker.elastic.co/logstash/logstash:${ELK_VERSION} + volumes: + - type: bind + source: ./logstash/config/logstash.yml + target: /usr/share/logstash/config/logstash.yml + read_only: true + - type: bind + source: ./logstash/pipeline + target: /usr/share/logstash/pipeline + read_only: true + ports: + - "5001:5001" + - "9600:9600" + environment: + LS_JAVA_OPTS: "-Xmx256m -Xms256m" + networks: + - elk + depends_on: + - elasticsearch + + kibana: + image: docker.elastic.co/kibana/kibana:${ELK_VERSION} + volumes: + - type: bind + source: ./kibana/config/kibana.yml + target: /usr/share/kibana/config/kibana.yml + read_only: true + ports: + - "5601:5601" + networks: + - elk + depends_on: + - elasticsearch + +networks: + elk: + 
driver: bridge + +volumes: + my_dbdata: + elasticsearch: \ No newline at end of file diff --git a/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml b/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml new file mode 100644 index 0000000..b831729 --- /dev/null +++ b/packages/ml_api/docker/elasticsearch/config/elasticsearch.yml @@ -0,0 +1,10 @@ +## Default Elasticsearch configuration from Elasticsearch base image. +## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml +cluster.name: "docker-cluster" +network.host: 0.0.0.0 + +## X-Pack settings +## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html +xpack.license.self_generated.type: basic +xpack.security.enabled: true +xpack.monitoring.collection.enabled: true diff --git a/packages/ml_api/docker/kibana/config/kibana.yml b/packages/ml_api/docker/kibana/config/kibana.yml new file mode 100644 index 0000000..6f0a087 --- /dev/null +++ b/packages/ml_api/docker/kibana/config/kibana.yml @@ -0,0 +1,12 @@ +--- +## Default Kibana configuration from Kibana base image. +## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js + +server.name: kibana +server.host: "0" +elasticsearch.hosts: [ "http://elasticsearch:9200" ] +xpack.monitoring.ui.container.elasticsearch.enabled: true + +## X-Pack security credentials +elasticsearch.username: elastic +elasticsearch.password: changeme diff --git a/packages/ml_api/docker/logstash/config/logstash.yml b/packages/ml_api/docker/logstash/config/logstash.yml new file mode 100644 index 0000000..9f69eac --- /dev/null +++ b/packages/ml_api/docker/logstash/config/logstash.yml @@ -0,0 +1,11 @@ +--- +## Default Logstash configuration from Logstash base image. 
+## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml +# +http.host: "0.0.0.0" +xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] + +## X-Pack security credentials +xpack.monitoring.enabled: true +xpack.monitoring.elasticsearch.username: elastic +xpack.monitoring.elasticsearch.password: changeme \ No newline at end of file diff --git a/packages/ml_api/docker/logstash/pipeline/logstash.conf b/packages/ml_api/docker/logstash/pipeline/logstash.conf new file mode 100644 index 0000000..2123bd0 --- /dev/null +++ b/packages/ml_api/docker/logstash/pipeline/logstash.conf @@ -0,0 +1,17 @@ +input { + tcp { + port => 5001 + tags => ["webapp_logs"] + type => "webapp_logs" + codec => json + } +} + +output { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "webapp_logs-%{+YYYY.MM.dd}" + } +} diff --git a/packages/ml_api/gunicorn_logging.conf b/packages/ml_api/gunicorn_logging.conf new file mode 100644 index 0000000..f4a2bdf --- /dev/null +++ b/packages/ml_api/gunicorn_logging.conf @@ -0,0 +1,49 @@ +[loggers] +keys=root, mlapi, logstash.error, logstash.access + +[handlers] +keys=console, logstash + +[formatters] +keys=generic, json + +[logger_root] +level=INFO +handlers=console +propagate=1 + +[logger_mlapi] +level=INFO +handlers=console,logstash +propagate=0 +qualname=mlapi + +[logger_logstash.error] +level=INFO +handlers=logstash +propagate=1 +qualname=gunicorn.error + +[logger_logstash.access] +level=INFO +handlers=logstash +propagate=0 +qualname=gunicorn.access + +[handler_console] +class=StreamHandler +formatter=generic +args=(sys.stdout, ) + +[handler_logstash] +class=logstash.TCPLogstashHandler +formatter=json +args=('logstash', 5001) + +[formatter_generic] +format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s +datefmt=%Y-%m-%d %H:%M:%S +class=logging.Formatter + +[formatter_json] +class=pythonjsonlogger.jsonlogger.JsonFormatter diff --git 
a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt index 4ef804c..2cab5cd 100644 --- a/packages/ml_api/requirements/requirements.txt +++ b/packages/ml_api/requirements/requirements.txt @@ -19,8 +19,12 @@ psycopg2>=2.8.4,<2.9.0 # DB Driver alembic>=1.3.1,<1.4.0 # DB Migrations sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils -# Monitoring +# Metrics prometheus_client>=0.7.1,<0.8.0 +# Logging +python3-logstash>=0.4.80,<0.5.0 +python-json-logger>=0.1.11,<0.2.0 + # Deployment gunicorn>=20.0.4,<20.1.0 diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py index 3e90817..5bb5e0a 100644 --- a/packages/ml_api/run.py +++ b/packages/ml_api/run.py @@ -4,12 +4,10 @@ from api.app import create_app from api.config import DevelopmentConfig, setup_app_logging - _config = DevelopmentConfig() # setup logging as early as possible setup_app_logging(config=_config) - main_app = create_app(config_object=_config).app application = DispatcherMiddleware( app=main_app.wsgi_app, From f72cfdb97bae83ca98d18f7061df9b063ff45e1f Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 15 Feb 2020 08:52:22 +0000 Subject: [PATCH 23/26] Monitoring Logs with the Elastic Stack - Create Kibana Dashboard for Model Inputs --- packages/ml_api/api/controller.py | 5 ++--- .../kibana_example_inputs_dashboard.ndjson | 5 +++++ .../docker/logstash/pipeline/logstash.conf | 21 +++++++++++++------ packages/ml_api/scripts/populate_database.py | 16 ++++++++++++-- packages/ml_api/tox.ini | 2 +- 5 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py index 7e986e6..47d9f07 100644 --- a/packages/ml_api/api/controller.py +++ b/packages/ml_api/api/controller.py @@ -54,9 +54,8 @@ def predict(): if request.method == "POST": # Step 1: Extract POST data from request body as JSON json_data = 
request.get_json() - _logger.info( - f'Inputs for model: {ModelType.LASSO.name} ' - f'Input values: {json_data}') + for entry in json_data: + _logger.info(entry) # Step 2a: Get and save live model predictions persistence = PredictionPersistence(db_session=current_app.db_session) diff --git a/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson b/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson new file mode 100644 index 0000000..266941d --- /dev/null +++ b/packages/ml_api/docker/kibana/config/kibana_example_inputs_dashboard.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"fields":"[{\"name\":\"1stFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"2ndFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"3SsnPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"@timestamp\",\"type\":\"date\",\"esTypes\":[\"date\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"@version\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"@version.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"@version\",\"subType\":\"multi\"},{\"name\":\"Alley\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Alley.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":
true,\"readFromDocValues\":true,\"parent\":\"Alley\",\"subType\":\"multi\"},{\"name\":\"BedroomAbvGr\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BldgType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BldgType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BldgType\",\"subType\":\"multi\"},{\"name\":\"BsmtCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtCond\",\"subType\":\"multi\"},{\"name\":\"BsmtExposure\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtExposure.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtExposure\",\"subType\":\"multi\"},{\"name\":\"BsmtFinSF1\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtFinSF2\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtFinType1\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtFinType1.keyword\",\"type\":\"strin
g\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtFinType1\",\"subType\":\"multi\"},{\"name\":\"BsmtFinType2\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtFinType2.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtFinType2\",\"subType\":\"multi\"},{\"name\":\"BsmtFullBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtHalfBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"BsmtQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"BsmtQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"BsmtQual\",\"subType\":\"multi\"},{\"name\":\"BsmtUnfSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"CentralAir\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"CentralAir.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"CentralAir\",\"subType\":\"multi\"},{\"name\":\"Condition1\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\
":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Condition1.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Condition1\",\"subType\":\"multi\"},{\"name\":\"Condition2\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Condition2.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Condition2\",\"subType\":\"multi\"},{\"name\":\"Electrical\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Electrical.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Electrical\",\"subType\":\"multi\"},{\"name\":\"EnclosedPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"ExterCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"ExterCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"ExterCond\",\"subType\":\"multi\"},{\"name\":\"ExterQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"ExterQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true
,\"parent\":\"ExterQual\",\"subType\":\"multi\"},{\"name\":\"Exterior1st\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Exterior1st.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Exterior1st\",\"subType\":\"multi\"},{\"name\":\"Exterior2nd\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Exterior2nd.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Exterior2nd\",\"subType\":\"multi\"},{\"name\":\"Fence\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Fence.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Fence\",\"subType\":\"multi\"},{\"name\":\"FireplaceQu\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"FireplaceQu.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"FireplaceQu\",\"subType\":\"multi\"},{\"name\":\"Fireplaces\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"FirstFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Foundation\
",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Foundation.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Foundation\",\"subType\":\"multi\"},{\"name\":\"FullBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Functional\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Functional.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Functional\",\"subType\":\"multi\"},{\"name\":\"GarageArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GarageCars\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GarageCond\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageCond.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageCond\",\"subType\":\"multi\"},{\"name\":\"GarageFinish\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageFinish.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\"
:true,\"readFromDocValues\":true,\"parent\":\"GarageFinish\",\"subType\":\"multi\"},{\"name\":\"GarageQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageQual\",\"subType\":\"multi\"},{\"name\":\"GarageType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"GarageType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"GarageType\",\"subType\":\"multi\"},{\"name\":\"GarageYrBlt\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"GrLivArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"HalfBath\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Heating\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Heating.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Heating\",\"subType\":\"multi\"},{\"name\":\"HeatingQC\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"HeatingQC.keyword\",\"type\":\"string\",\
"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"HeatingQC\",\"subType\":\"multi\"},{\"name\":\"HouseStyle\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"HouseStyle.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"HouseStyle\",\"subType\":\"multi\"},{\"name\":\"Id\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"KitchenAbvGr\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"KitchenQual\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"KitchenQual.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"KitchenQual\",\"subType\":\"multi\"},{\"name\":\"LandContour\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LandContour.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LandContour\",\"subType\":\"multi\"},{\"name\":\"LandSlope\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LandSlope.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\
":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LandSlope\",\"subType\":\"multi\"},{\"name\":\"LotArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"LotConfig\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LotConfig.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LotConfig\",\"subType\":\"multi\"},{\"name\":\"LotFrontage\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"LotShape\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"LotShape.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"LotShape\",\"subType\":\"multi\"},{\"name\":\"LowQualFinSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MSSubClass\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MSZoning\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MSZoning.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"MSZoning\",\"subType\":\"multi\"},{\"name\":\"MasVnrArea\",\"type\":\"nu
mber\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MasVnrType\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MasVnrType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"MasVnrType\",\"subType\":\"multi\"},{\"name\":\"MiscFeature\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"MiscFeature.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"MiscFeature\",\"subType\":\"multi\"},{\"name\":\"MiscVal\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"MoSold\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Neighborhood\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Neighborhood.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Neighborhood\",\"subType\":\"multi\"},{\"name\":\"OpenPorchSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"OverallCond\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\
":true},{\"name\":\"OverallQual\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"PavedDrive\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"PavedDrive.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"PavedDrive\",\"subType\":\"multi\"},{\"name\":\"PoolArea\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"RoofMatl\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"RoofMatl.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"RoofMatl\",\"subType\":\"multi\"},{\"name\":\"RoofStyle\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"RoofStyle.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"RoofStyle\",\"subType\":\"multi\"},{\"name\":\"SaleCondition\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"SaleCondition.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"SaleCondition\",\"subType\":\"multi\"},{\"name\":\"SaleType\",\"type\":\"string\",\"esTypes\":[\"t
ext\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"SaleType.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"SaleType\",\"subType\":\"multi\"},{\"name\":\"ScreenPorch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"SecondFlrSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Street\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Street.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Street\",\"subType\":\"multi\"},{\"name\":\"ThreeSsnPortch\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"TotRmsAbvGrd\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"TotalBsmtSF\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"Utilities\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"Utilities.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"Utilities\",\"subType\":\"multi\"},{\"name\":\"WoodDeckSF\",\"ty
pe\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YearBuilt\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YearRemodAdd\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"YrSold\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"esTypes\":[\"_type\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"host\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"host.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"host\",\"subType\":\"multi\"},{\"name\":\"message\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":
false},{\"name\":\"message.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"message\",\"subType\":\"multi\"},{\"name\":\"port\",\"type\":\"number\",\"esTypes\":[\"long\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"tags\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"tags.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"tags\",\"subType\":\"multi\"},{\"name\":\"type\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"type.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"parent\":\"type\",\"subType\":\"multi\"}]","timeFieldName":"@timestamp","title":"input*"},"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-02-15T08:38:19.009Z","version":"WzExLDJd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"BsmtQual Pie Chart","uiStateJSON":"{\"vis\":{\"legendOpen\":true}}","version":1,"visState":"{\"title\":\"BsmtQual Pie 
Chart\",\"type\":\"pie\",\"params\":{\"type\":\"pie\",\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"isDonut\":true,\"labels\":{\"show\":false,\"values\":true,\"last_level\":true,\"truncate\":100},\"dimensions\":{\"metric\":{\"accessor\":0,\"format\":{\"id\":\"number\"},\"params\":{},\"aggType\":\"count\"}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"BsmtQual.keyword\",\"orderBy\":\"1\",\"order\":\"desc\",\"size\":5,\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\"}}]}"},"id":"1d3afa50-4b76-11ea-a505-e57bbdfb6038","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-02-15T08:38:19.009Z","version":"WzEyLDJd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"LotArea Line Graph","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"LotArea Line 
Graph\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":false},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"filter\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Count\"}}],\"seriesParams\":[{\"show\":true,\"type\":\"histogram\",\"mode\":\"stacked\",\"data\":{\"label\":\"Count\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"lineWidth\":2,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false,\"labels\":{\"show\":false},\"thresholdLine\":{\"show\":false,\"value\":10,\"width\":1,\"style\":\"full\",\"color\":\"#34130C\"},\"dimensions\":{\"x\":null,\"y\":[{\"accessor\":0,\"format\":{\"id\":\"number\"},\"params\":{},\"aggType\":\"count\"}]}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"range\",\"schema\":\"segment\",\"params\":{\"field\":\"LotArea\",\"ranges\":[{\"from\":0,\"to\":10000},{\"from\":10000,\"to\":20000},{\"from\":20000,\"to\":30000},{\"from\":40000,\"to\":50000},{\"from\":50000}]}}]}"},"id":"49eceef0-4b76-11ea-a505-e57bbdfb6038","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"61d12f10-4b74-11ea-a505-e57bbdfb6038","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-02-15T08:38:19.009Z","version":"WzEzLDJd"} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"language\":\"kuery\",\"query\":\"\"},\"filter\":[]}"},"optionsJSON":"{\"hidePanelTitles\":false,\"useMargins\":true}","panelsJSON":"[{\"embeddableConfig\":{},\"gridData\":{\"h\":15,\"i\":\"76c9e383-bdbb-4658-8795-118493af4ec3\",\"w\":24,\"x\":24,\"y\":0},\"panelIndex\":\"76c9e383-bdbb-4658-8795-118493af4ec3\",\"version\":\"7.5.1\",\"panelRefName\":\"panel_0\"},{\"embeddableConfig\":{},\"gridData\":{\"h\":15,\"i\":\"17e4efa7-9495-4df4-b7cf-1c3900042d8d\",\"w\":24,\"x\":0,\"y\":0},\"panelIndex\":\"17e4efa7-9495-4df4-b7cf-1c3900042d8d\",\"version\":\"7.5.1\",\"panelRefName\":\"panel_1\"}]","timeRestore":false,"title":"Example Inputs Dashboard","version":1},"id":"2daf01d0-4fcf-11ea-bad8-6dbf60384395","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"1d3afa50-4b76-11ea-a505-e57bbdfb6038","name":"panel_0","type":"visualization"},{"id":"49eceef0-4b76-11ea-a505-e57bbdfb6038","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-02-15T08:43:03.149Z","version":"WzIyLDJd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file diff --git a/packages/ml_api/docker/logstash/pipeline/logstash.conf b/packages/ml_api/docker/logstash/pipeline/logstash.conf index 2123bd0..8cb6770 100644 --- a/packages/ml_api/docker/logstash/pipeline/logstash.conf +++ b/packages/ml_api/docker/logstash/pipeline/logstash.conf @@ -8,10 +8,19 @@ input { } output { - elasticsearch { - hosts => "elasticsearch:9200" - user => "elastic" - password => "changeme" - index => "webapp_logs-%{+YYYY.MM.dd}" + if [LotArea] { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "input_logs-%{+YYYY.MM.dd}" + } + } else { + elasticsearch { + hosts => "elasticsearch:9200" + user => "elastic" + password => "changeme" + index => "webapp_logs-%{+YYYY.MM.dd}" } -} + } +} \ No newline at end of file 
diff --git a/packages/ml_api/scripts/populate_database.py b/packages/ml_api/scripts/populate_database.py index fa12d88..1a40323 100644 --- a/packages/ml_api/scripts/populate_database.py +++ b/packages/ml_api/scripts/populate_database.py @@ -2,7 +2,7 @@ import os import time import typing as t -from random import randint +from random import randint, choice import pandas as pd import requests @@ -19,14 +19,21 @@ SECOND_FLR_SF_MAP = {"min": 0, "max": 1862} +BSMT_QUAL_VALUES = ('Gd', 'TA', 'Ex', 'Fa') + def _generate_random_int(value: int, value_ranges: t.Mapping) -> int: """Generate random integer within a min and max range.""" random_value = randint(value_ranges["min"], value_ranges["max"]) - return int(random_value) +def _select_random_category(value: str, value_options: t.Sequence) -> str: + """Select random category given a sequence of categories.""" + random_category = choice(value_options) + return random_category + + def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: """Prepare input data by removing key rows with NA values.""" clean_inputs_df = dataframe.dropna( @@ -43,6 +50,10 @@ def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: _generate_random_int, value_ranges=LOT_AREA_MAP ) + clean_inputs_df.loc[:, "BsmtQual"] = clean_inputs_df["BsmtQual"].apply( + _select_random_category, value_options=BSMT_QUAL_VALUES + ) + return clean_inputs_df @@ -74,6 +85,7 @@ def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None: clean_inputs_df.loc[:, "OverallQual"] = 1 clean_inputs_df.loc[:, "GrLivArea"] = 1 + clean_inputs_df = clean_inputs_df.where(pd.notnull(clean_inputs_df), None) for index, data in clean_inputs_df.iterrows(): if index > n_predictions: if anomaly: diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 09a16a0..9dec932 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -98,7 +98,7 @@ passenv = setenv = PYTHONPATH=. 
- DB_HOST={env:DB_HOST:localhost} + DB_HOST=localhost commands = python scripts/populate_database.py {posargs} From 0a0f13139c0cbdc013d74d98d4b78040a8067223 Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sat, 15 Feb 2020 11:04:39 +0000 Subject: [PATCH 24/26] Not Part Of Course - Add CI --- .circleci/config.yml | 127 ++++++++++++++++++ .../test_requirements.txt | 3 + packages/gradient_boosting_model/tox.ini | 16 ++- packages/ml_api/tox.ini | 3 + 4 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..6525aa6 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,127 @@ +version: 2.1 + +jobs: + test_gradient_model_py36: + docker: + - image: circleci/python:3.6.9 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + test_gradient_model_py37: + docker: + - image: circleci/python:3.7.6 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + test_gradient_model_py38: + docker: + - image: circleci/python:3.8.0 + working_directory: ~/project/packages/gradient_boosting_model + steps: + - checkout: + path: ~/project + - run: + name: Run tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 + test_ml_api_py36: + docker: + - image: circleci/python:3.6.9 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: 
~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.6 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py36 + test_ml_api_py37: + docker: + - image: circleci/python:3.7.6 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.7 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py37 + test_ml_api_py38: + docker: + - image: circleci/python:3.8.1 + - image: postgres + environment: + POSTGRES_USER: test_user + POSTGRES_PASSWORD: password + POSTGRES_DB: ml_api_test + environment: + DB_HOST: localhost + DB_PORT: 5432 + DB_USER: test_user + DB_PASSWORD: password + DB_NAME: ml_api_test + SHADOW_MODE_ACTIVE: true + working_directory: ~/project/packages/ml_api + steps: + - checkout: + path: ~/project + - run: + name: Run API tests with Python 3.8 + command: | + sudo pip install --upgrade pip + pip install --user tox + tox -e py38 +workflows: + version: 2 + test-all: + jobs: + - test_gradient_model_py36 + - test_gradient_model_py37 + - test_gradient_model_py38 + - test_ml_api_py36 + - test_ml_api_py37 + - test_ml_api_py38 diff --git a/packages/gradient_boosting_model/test_requirements.txt b/packages/gradient_boosting_model/test_requirements.txt index 993846e..c44378e 100644 --- a/packages/gradient_boosting_model/test_requirements.txt +++ b/packages/gradient_boosting_model/test_requirements.txt @@ -11,3 +11,6 @@ tid-regression-model>=2.0.20,<2.1.0 black>=19.10b0,<20.0 flake8>=3.7.9,<4.0 mypy>=0.740 + +# kaggle cli +kaggle>=1.5.6,<1.6.0 diff --git a/packages/gradient_boosting_model/tox.ini 
b/packages/gradient_boosting_model/tox.ini index e898072..7c9059d 100644 --- a/packages/gradient_boosting_model/tox.ini +++ b/packages/gradient_boosting_model/tox.ini @@ -8,8 +8,22 @@ install_command = pip install {opts} {packages} deps = -rtest_requirements.txt +passenv = + KAGGLE_USERNAME + KAGGLE_KEY + +setenv = + PYTHONPATH=. + commands= - py.test + kaggle competitions download -c house-prices-advanced-regression-techniques -p gradient_boosting_model/datasets/ + unzip -o gradient_boosting_model/datasets/house-prices-advanced-regression-techniques.zip -d gradient_boosting_model/datasets + mv gradient_boosting_model/datasets/train.csv gradient_boosting_model/datasets/houseprice.csv + python gradient_boosting_model/train_pipeline.py + pytest \ + -s \ + -vv \ + {posargs:tests/} [testenv:unit_tests] diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini index 9dec932..031f45f 100644 --- a/packages/ml_api/tox.ini +++ b/packages/ml_api/tox.ini @@ -9,6 +9,9 @@ install_command = pip install {opts} {packages} deps = -rrequirements/test_requirements.txt +setenv = + PYTHONPATH=. + passenv = # A list of wildcard environment variable names which shall be copied from # the tox invocation environment to the test environment when executing test commands From c4c0bc8d857326cc10899be6fe7c5bb03586347c Mon Sep 17 00:00:00 2001 From: Christopher Samiullah Date: Sun, 16 Feb 2020 20:27:46 +0000 Subject: [PATCH 25/26] Update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 95280ac..304b6fb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1 @@ -# testing-and-monitoring-ml-deployments -WIP +Example project for the course "Testing & Monitoring Machine Learning Model Deployments". For setup instructions, see the course lectures. 
From 31ce1ccf3957812bc5ec0812155f817d9bc67c42 Mon Sep 17 00:00:00 2001 From: satishukadam Date: Thu, 5 Mar 2020 09:22:37 +0530 Subject: [PATCH 26/26] committed test.txt file --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..e69de29