Skip to content

Commit 25e95f9

Browse files
authored
Merge pull request #4 from ol1g3/utils-and-pipelines
Added Tests and CI Pipeline
2 parents 2db2b9d + d05a026 commit 25e95f9

File tree

9 files changed

+312
-9
lines changed

9 files changed

+312
-9
lines changed

.github/workflows/ci.yml

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
name: CI Pipeline
2+
3+
on:
4+
push
5+
6+
jobs:
7+
build:
8+
runs-on: ubuntu-22.04
9+
steps:
10+
- uses: actions/checkout@v3
11+
12+
- name: Set up Python
13+
uses: actions/setup-python@v4
14+
with:
15+
python-version: 3.11
16+
cache: 'pip'
17+
18+
- name: Install dependencies
19+
run: |
20+
python -m venv .venv
21+
source .venv/bin/activate
22+
pip install -r requirements.txt
23+
24+
- name: Build project
25+
run: |
26+
source .venv/bin/activate
27+
python -m compileall .
28+
29+
lint:
30+
runs-on: ubuntu-22.04
31+
needs: build
32+
steps:
33+
- uses: actions/checkout@v3
34+
35+
- name: Set up Python
36+
uses: actions/setup-python@v4
37+
with:
38+
python-version: 3.11
39+
cache: 'pip'
40+
41+
- name: Install linter dependencies
42+
run: |
43+
python -m venv .venv
44+
source .venv/bin/activate
45+
pip install ruff
46+
47+
- name: Lint code
48+
run: |
49+
source .venv/bin/activate
50+
ruff check . \
51+
--exclude=.venv \
52+
--force-exclude \
53+
--select=E9,F63,F7,F82 \
54+
--output-format=full
55+
test:
56+
runs-on: ubuntu-22.04
57+
needs: build
58+
steps:
59+
- uses: actions/checkout@v3
60+
61+
- name: Set up Python
62+
uses: actions/setup-python@v4
63+
with:
64+
python-version: 3.11
65+
cache: pip
66+
67+
- name: Install dependencies
68+
run: |
69+
python -m venv .venv
70+
source .venv/bin/activate
71+
pip install -r requirements.txt
72+
73+
- name: Run data loader & preprocessor tests
74+
env:
75+
PYTHONPATH: ${{ github.workspace }}
76+
run: |
77+
source .venv/bin/activate
78+
python tests/test_data_loader.py
79+
80+
- name: Run model tests
81+
env:
82+
PYTHONPATH: ${{ github.workspace }}
83+
run: |
84+
source .venv/bin/activate
85+
python tests/test_model.py

README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,49 @@
11
# fraud-detection-ML-pipeline
22
Fraud Detection System using Machine Learning, with a complete CI/CT/CD pipeline
33

4+
## Installation
5+
6+
1. **Clone the Repository:**
7+
```bash
8+
git clone https://github.com/ol1g3/fraud-detection-ML-pipeline.git
9+
cd fraud-detection-ML-pipeline
10+
```
11+
2. **Set Up the Virtual Environment:** Choose one of the methods below.
12+
13+
## Virtual Environment Setup Options
14+
15+
### Using venv (default)
16+
17+
1. **Create the Virtual Environment (Mac):**
18+
```bash
19+
python3 -m venv .venv
20+
```
21+
2. **Activate the Virtual Environment and Install Requirements:**
22+
```bash
23+
source .venv/bin/activate
24+
pip install -r requirements.txt
25+
```
26+
3. **Deactivate the Virtual Environment (when done):**
27+
```bash
28+
deactivate
29+
```
30+
31+
### Using uv (alternative)
32+
33+
1. **Create the Virtual Environment:**
34+
```bash
35+
uv venv --python 3.11
36+
```
37+
2. **Activate the Virtual Environment And Install Requirements:**
38+
```bash
39+
source .venv/bin/activate
40+
uv pip install -r requirements.txt
41+
```
42+
3. **Deactivate the Virtual Environment (when done):**
43+
```bash
44+
deactivate
45+
```
46+
47+
## Data Source
48+
49+
The dataset used for this project can be found on Kaggle: [Fraud Detection Dataset](https://www.kaggle.com/datasets/kartik2112/fraud-detection).

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ pandas>=2.2.3
44
scikit-learn>=1.6.1
55
torch>=2.6.0
66
torchvision>=0.21.0
7-
typing>=3.10.0.0
7+
typing>=3.7.0

src/__init__.py

Whitespace-only changes.

src/data/loader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,22 @@ def load_data(
6464
self.test_data = self.load_csv(test_filename)
6565

6666
def train_valid_split(
67-
self, valid_size: float = 0.15
67+
self, train_size: float = 0.15
6868
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
6969
"""
7070
Split dataset into train, validation, and test sets.
7171
7272
Args:
7373
df: DataFrame to split
74-
valid_size: Fraction of data for validation
74+
train_size: Fraction of the loaded test data assigned to the validation split (the remainder stays as the test split)
7575
7676
Returns:
7777
tuple of (train, validation, test) DataFrames
7878
"""
7979

8080
valid_df, test_df = train_test_split(
8181
self.test_data,
82-
test_size=valid_size,
82+
train_size=train_size,
8383
random_state=42,
8484
)
8585

src/data/preprocessor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,11 @@ def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
6565
# Categorical columns: fill with most frequent value
6666
for col in self.categorical_features:
6767
if col in df and df[col].isnull().sum() > 0:
68-
df[col] = df[col].fillna(df[col].mode()[0])
68+
mode_values = df[col].mode()
69+
if not mode_values.empty:
70+
df[col] = df[col].fillna(mode_values[0])
71+
else:
72+
df[col] = df[col].fillna("")
6973

7074
return df
7175

@@ -216,6 +220,8 @@ def transform(self, df: pd.DataFrame) -> tuple[np.ndarray, Optional[np.ndarray]]
216220
else:
217221
X = np.array([])
218222

223+
if len(X[0]) > 1:
224+
X = X[:, :-1]
219225
return X, y
220226

221227
def fit_transform(

src/utils/logger.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55
class PipelineLogger(logging.Logger):
66
def __init__(self, name, log_file=None, level=logging.INFO):
7-
self.logger = logging.getLogger(name)
8-
self.logger.setLevel(level)
7+
super().__init__(name, level)
98

109
# Configure handlers based on parameters
1110
self._setup_handlers(log_file, level)
@@ -26,6 +25,6 @@ def _setup_handlers(self, log_file, level):
2625
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
2726
)
2827
f_handler.setFormatter(f_format)
29-
self.logger.addHandler(f_handler)
28+
self.addHandler(f_handler)
3029

31-
self.logger.addHandler(c_handler)
30+
self.addHandler(c_handler)

tests/test_data_loader.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import sys
2+
import os
3+
import tempfile
4+
5+
# Allow running this file directly (`python tests/test_data_loader.py`) without
# PYTHONPATH: the imports below use the `src.` prefix, which needs the repository
# root on sys.path. `src/` itself is still inserted last so it stays at index 0,
# exactly as before.
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
for _relative in ("..", "../src"):
    sys.path.insert(0, os.path.abspath(os.path.join(_TESTS_DIR, _relative)))
6+
7+
import pandas as pd
8+
from src.data.loader import DataLoader
9+
from src.data.preprocessor import DataPreprocessor
10+
11+
12+
def test_data_loader_real():
    """Check DataLoader's loading and validation/test split on a small sample.

    Reads the first 200 training rows and first 100 test rows from
    ``data/processed`` (relative to the current working directory), writes
    them to a temporary directory and points a DataLoader at that copy.
    """
    processed_dir = os.path.join("data", "processed")
    train_sample = pd.read_csv(os.path.join(processed_dir, "train.csv"), nrows=200)
    test_sample = pd.read_csv(os.path.join(processed_dir, "test.csv"), nrows=100)

    with tempfile.TemporaryDirectory() as workdir:
        for frame, filename in ((train_sample, "train.csv"), (test_sample, "test.csv")):
            frame.to_csv(os.path.join(workdir, filename), index=False)

        loader = DataLoader(data_dir=workdir)
        loader.load_data("train.csv", "test.csv")
        train, valid, test = loader.train_valid_split(0.2)

        # The training set is returned untouched; only the 100 test rows are
        # divided, 20% to validation and the rest to test.
        assert train.shape[0] == 200
        assert valid.shape[0] == 20
        assert test.shape[0] == 80
31+
32+
33+
def detect_date_columns(df):
    """Return the names of columns in which at least one value parses as a datetime.

    Each column is passed through ``pd.to_datetime(..., errors="coerce")``; a
    column is reported when the result contains any non-null entry. Columns
    whose conversion raises are skipped silently (best effort).
    """
    found = []
    for name in df.columns:
        try:
            has_dates = pd.to_datetime(df[name], errors="coerce").notna().any()
            if has_dates:
                found.append(name)
        except Exception:
            # Deliberate best-effort: an unparseable column is simply not a date column.
            continue
    return found
43+
44+
45+
def test_preprocessor():
    """Fit DataPreprocessor on the processed training data and check output shapes.

    Loads ``data/processed/train.csv`` and ``test.csv`` through DataLoader,
    derives the date, categorical and numerical column lists from the training
    frame, then asserts that the transformed features and target keep one row
    per training row and that the feature matrix has 9 columns.
    """
    loader = DataLoader("data/processed")
    loader.load_data("train.csv", "test.csv")
    train, _val, _test = loader.train_valid_split(0.2)

    date_columns = detect_date_columns(train.copy())

    # Object-typed columns that were not recognised as dates are categorical.
    object_columns = train.select_dtypes(include=["object"]).columns
    categorical = [name for name in object_columns if name not in date_columns]
    numerical = train.select_dtypes(include=["float64", "int64"]).columns

    preprocessor = DataPreprocessor(
        categorical, numerical, "is_fraud", date_columns
    ).fit(train)
    features, target = preprocessor.transform(train)

    expected_rows = train.shape[0]
    assert features.shape[0] == expected_rows
    assert target.shape[0] == expected_rows

    assert features.shape[1] == 9
71+
72+
73+
def main():
    """Run this module's tests as a plain script and exit with a CI-friendly status.

    Tests run in order; the first failure is reported on stderr (test name,
    exception and traceback) and the process exits with status 1. When every
    test passes, a confirmation is printed and the process exits with status 0.
    """
    import traceback

    for test in (test_data_loader_real, test_preprocessor):
        try:
            test()
        except Exception as e:
            # A bare `assert` raises AssertionError with an empty message, so
            # include the test name, the exception repr and the traceback to
            # make the CI log actionable.
            print(f"Test failed: {test.__name__}: {e!r}", file=sys.stderr)
            traceback.print_exc()
            sys.exit(1)

    print("All DataLoader & Preprocessor tests passed.")
    sys.exit(0)
88+
89+
90+
if __name__ == "__main__":
91+
main()

tests/test_model.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import sys
2+
import os
3+
4+
# Allow running this file directly (`python tests/test_model.py`) without
# PYTHONPATH: the imports below use the `src.` prefix, which needs the repository
# root on sys.path. `src/` itself is still inserted last so it stays at index 0,
# exactly as before.
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
for _relative in ("..", "../src"):
    sys.path.insert(0, os.path.abspath(os.path.join(_TESTS_DIR, _relative)))
5+
6+
import pandas as pd
7+
from src.data.loader import DataLoader
8+
from src.data.preprocessor import DataPreprocessor
9+
from src.model.model import FraudDetectionModel
10+
import numpy as np
11+
12+
13+
def detect_date_columns(df):
    """Return the names of columns in which at least one value parses as a datetime.

    Each column is passed through ``pd.to_datetime(..., errors="coerce")``; a
    column is reported when the result contains any non-null entry. Columns
    whose conversion raises are skipped silently (best effort).
    """
    found = []
    for name in df.columns:
        try:
            has_dates = pd.to_datetime(df[name], errors="coerce").notna().any()
            if has_dates:
                found.append(name)
        except Exception:
            # Deliberate best-effort: an unparseable column is simply not a date column.
            continue
    return found
23+
24+
25+
def test_model() -> float:
    """Train FraudDetectionModel on the processed training data and return its accuracy.

    The data is loaded through DataLoader, preprocessed with DataPreprocessor
    (target column ``is_fraud``), and the model is both trained and evaluated
    on the same transformed training set.

    Returns:
        The ``"accuracy"`` entry of the dictionary returned by ``evaluate``.
    """
    loader = DataLoader("data/processed")
    loader.load_data("train.csv", "test.csv")
    train, _val, _test = loader.train_valid_split(0.2)

    date_columns = detect_date_columns(train.copy())

    # Object-typed columns that were not recognised as dates are categorical.
    object_columns = train.select_dtypes(include=["object"]).columns
    categorical = [name for name in object_columns if name not in date_columns]
    numerical = train.select_dtypes(include=["float64", "int64"]).columns

    preprocessor = DataPreprocessor(
        categorical, numerical, "is_fraud", date_columns
    ).fit(train)
    features, target = preprocessor.transform(train)
    target = np.array(target).astype(np.float64)

    model = FraudDetectionModel(
        features.shape[1],
    )
    model.train(features, target)
    metrics = model.evaluate(features, target)

    return metrics["accuracy"]
56+
57+
58+
def main():
    """Run the model regression test as a plain script and exit with a CI-friendly status.

    Exits with status 1 when ``test_model`` raises (the exception and its
    traceback are written to stderr) or when the returned accuracy is not at
    least 0.8; exits with status 0 otherwise.
    """
    import traceback

    try:
        accuracy = test_model()
    except Exception as e:
        # Include the exception repr and traceback so the CI log shows where
        # the failure happened, even when the message is empty.
        print(f"Test failed: test_model: {e!r}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

    # Regression test. Written as `not (>=)` so that a NaN accuracy, for which
    # every comparison is False, is treated as a failure instead of slipping
    # past an `accuracy < 0.8` check.
    if not (accuracy >= 0.8):
        print("Model accuracy too low:", accuracy, file=sys.stderr)
        sys.exit(1)

    print("All model tests passed.")
    sys.exit(0)
73+
74+
75+
if __name__ == "__main__":
76+
main()

0 commit comments

Comments
 (0)