Merge pull request #4 from ol1g3/utils-and-pipelines

ol1g3 · web-flow · commit 25e95f92d7f4 · 2025-04-18T00:06:28.000+02:00
Added Tests and CI Pipeline
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,85 @@
+name: CI Pipeline
+
+on:
+  push
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: 3.11
+            cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+            python -m venv .venv
+            source .venv/bin/activate
+            pip install -r requirements.txt
+
+      - name: Build project
+        run: |
+            source .venv/bin/activate
+            python -m compileall .
+
+  lint:
+    runs-on: ubuntu-22.04
+    needs: build
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: 3.11
+            cache: 'pip'
+
+      - name: Install linter dependencies
+        run: |
+            python -m venv .venv
+            source .venv/bin/activate
+            pip install ruff
+
+      - name: Lint code
+        run: |
+            source .venv/bin/activate
+            ruff check . \
+            --exclude=.venv \
+            --force-exclude \
+            --select=E9,F63,F7,F82 \
+            --output-format=full
+  test:  # Runs the data loader/preprocessor and model test scripts after the build job succeeds
+    runs-on: ubuntu-22.04
+    needs: build
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: 3.11
+            cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+            python -m venv .venv
+            source .venv/bin/activate
+            pip install -r requirements.txt
+
+      - name: Run data loader & preprocessor tests
+        env:
+            PYTHONPATH: ${{ github.workspace }}  # repo root on the import path so `from src...` resolves
+        run: |
+            source .venv/bin/activate
+            python tests/test_data_loader.py
+
+      - name: Run model tests
+        env:
+            PYTHONPATH: ${{ github.workspace }}  # repo root on the import path so `from src...` resolves
+        run: |
+            source .venv/bin/activate
+            python tests/test_model.py
diff --git a/README.md b/README.md
@@ -1,3 +1,49 @@
 # fraud-detection-ML-pipeline
 Fraud Detection System using Machine Learning, with a complete CI/CT/CD pipeline
 
+## Installation
+
+1. **Clone the Repository:**
+    ```bash
+    git clone https://github.com/ol1g3/fraud-detection-ML-pipeline.git
+    cd fraud-detection-ML-pipeline
+    ```
+2. **Set Up the Virtual Environment:** Choose one of the methods below.
+
+## Virtual Environment Setup Options
+
+### Using venv (default)
+
+1. **Create the Virtual Environment (macOS/Linux):**
+    ```bash
+    python3 -m venv .venv
+    ```
+2. **Activate the Virtual Environment and Install Requirements:**
+    ```bash
+    source .venv/bin/activate
+    pip install -r requirements.txt
+    ```
+3. **Deactivate the Virtual Environment (when done):**
+    ```bash
+    deactivate
+    ```
+
+### Using uv (alternative)
+
+1. **Create the Virtual Environment:**
+    ```bash
+    uv venv --python 3.11
+    ```
+2. **Activate the Virtual Environment and Install Requirements:**
+    ```bash
+    source .venv/bin/activate
+    uv pip install -r requirements.txt
+    ```
+3. **Deactivate the Virtual Environment (when done):**
+    ```bash
+    deactivate
+    ```
+
+## Data Source
+
+The dataset used for this project can be found on Kaggle: [Fraud Detection Dataset](https://www.kaggle.com/datasets/kartik2112/fraud-detection).
diff --git a/requirements.txt b/requirements.txt
@@ -4,4 +4,4 @@ pandas>=2.2.3
 scikit-learn>=1.6.1
 torch>=2.6.0
 torchvision>=0.21.0
-typing>=3.10.0.0
+typing>=3.7.0
diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/data/loader.py b/src/data/loader.py
@@ -64,22 +64,22 @@ def load_data(
         self.test_data = self.load_csv(test_filename)
 
     def train_valid_split(
-        self, valid_size: float = 0.15
+        self, train_size: float = 0.15
     ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         Split dataset into train, validation, and test sets.
 
         Args:
             df: DataFrame to split
-            valid_size: Fraction of data for validation
+            train_size: Fraction of the loaded test data assigned to the validation split
 
         Returns:
             tuple of (train, validation, test) DataFrames
         """
 
         valid_df, test_df = train_test_split(
             self.test_data,
-            test_size=valid_size,
+            train_size=train_size,
             random_state=42,
         )
 
diff --git a/src/data/preprocessor.py b/src/data/preprocessor.py
@@ -65,7 +65,11 @@ def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
         # Categorical columns: fill with most frequent value
         for col in self.categorical_features:
             if col in df and df[col].isnull().sum() > 0:
-                df[col] = df[col].fillna(df[col].mode()[0])
+                mode_values = df[col].mode()
+                if not mode_values.empty:
+                    df[col] = df[col].fillna(mode_values[0])
+                else:
+                    df[col] = df[col].fillna("")
 
         return df
 
@@ -216,6 +220,8 @@ def transform(self, df: pd.DataFrame) -> tuple[np.ndarray, Optional[np.ndarray]]
         else:
             X = np.array([])
 
+        if len(X[0]) > 1:
+            X = X[:, :-1]
         return X, y
 
     def fit_transform(
diff --git a/src/utils/logger.py b/src/utils/logger.py
@@ -4,8 +4,7 @@
 
 class PipelineLogger(logging.Logger):
     def __init__(self, name, log_file=None, level=logging.INFO):
-        self.logger = logging.getLogger(name)
-        self.logger.setLevel(level)
+        super().__init__(name, level)
 
         # Configure handlers based on parameters
         self._setup_handlers(log_file, level)
@@ -26,6 +25,6 @@ def _setup_handlers(self, log_file, level):
                 "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
             )
             f_handler.setFormatter(f_format)
-            self.logger.addHandler(f_handler)
+            self.addHandler(f_handler)
 
-        self.logger.addHandler(c_handler)
+        self.addHandler(c_handler)
diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py
@@ -0,0 +1,91 @@
+import sys
+import os
+import tempfile
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
+
+import pandas as pd
+from src.data.loader import DataLoader
+from src.data.preprocessor import DataPreprocessor
+
+
+def test_data_loader_real():
+    orig_train = os.path.join("data", "processed", "train.csv")
+    orig_test = os.path.join("data", "processed", "test.csv")
+
+    df_train = pd.read_csv(orig_train, nrows=200)
+    df_test = pd.read_csv(orig_test, nrows=100)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        df_train.to_csv(os.path.join(tmp, "train.csv"), index=False)
+        df_test.to_csv(os.path.join(tmp, "test.csv"), index=False)
+
+        loader = DataLoader(data_dir=tmp)
+        loader.load_data("train.csv", "test.csv")
+        train, valid, test = loader.train_valid_split(0.2)
+
+        # Train dataset should remain the same, valid and test should be split
+        assert train.shape[0] == 200
+        assert valid.shape[0] == 20
+        assert test.shape[0] == 80
+
+
+def detect_date_columns(df):
+    date_columns = []
+    for col in df.columns:
+        try:
+            temp_series = pd.to_datetime(df[col], errors="coerce")
+            if temp_series.notna().any():
+                date_columns.append(col)
+        except Exception:
+            pass
+    return date_columns
+
+
+def test_preprocessor():
+
+    d = DataLoader("data/processed")
+
+    d.load_data("train.csv", "test.csv")
+
+    train, val, test = d.train_valid_split(0.2)
+
+    date_columns = detect_date_columns(train.copy())
+
+    categorical = [
+        col
+        for col in train.select_dtypes(include=["object"]).columns
+        if col not in date_columns
+    ]
+    numerical = train.select_dtypes(include=["float64", "int64"]).columns
+
+    p = DataPreprocessor(categorical, numerical, "is_fraud", date_columns)
+
+    p = p.fit(train)
+    X, y = p.transform(train)
+
+    assert X.shape[0] == train.shape[0]
+    assert y.shape[0] == train.shape[0]
+
+    assert X.shape[1] == 9
+
+
+def main():
+    try:
+        test_data_loader_real()
+    except Exception as e:
+        print(f"Test failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        test_preprocessor()
+    except Exception as e:
+        print(f"Test failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    print("All DataLoader & Preprocessor tests passed.")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -0,0 +1,76 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
+
+import pandas as pd
+from src.data.loader import DataLoader
+from src.data.preprocessor import DataPreprocessor
+from src.model.model import FraudDetectionModel
+import numpy as np
+
+
+def detect_date_columns(df):
+    date_columns = []
+    for col in df.columns:
+        try:
+            temp_series = pd.to_datetime(df[col], errors="coerce")
+            if temp_series.notna().any():
+                date_columns.append(col)
+        except Exception:
+            pass
+    return date_columns
+
+
+def test_model() -> float:
+
+    d = DataLoader("data/processed")
+
+    d.load_data("train.csv", "test.csv")
+
+    train, val, test = d.train_valid_split(0.2)
+
+    date_columns = detect_date_columns(train.copy())
+
+    categorical = [
+        col
+        for col in train.select_dtypes(include=["object"]).columns
+        if col not in date_columns
+    ]
+    numerical = train.select_dtypes(include=["float64", "int64"]).columns
+
+    p = DataPreprocessor(categorical, numerical, "is_fraud", date_columns)
+
+    p = p.fit(train)
+    X, y = p.transform(train)
+    y = np.array(y).astype(np.float64)
+
+    fd = FraudDetectionModel(
+        X.shape[1],
+    )
+
+    fd.train(X, y)
+    ans = fd.evaluate(X, y)
+
+    return ans["accuracy"]
+
+
+def main():
+    accuracy = -1
+    try:
+        accuracy = test_model()
+    except Exception as e:
+        print(f"Test failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Regression test
+    if accuracy < 0.8:
+        print("Model accuracy too low:", accuracy, file=sys.stderr)
+        sys.exit(1)
+
+    print("All model tests passed.")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()