Skip to content

Commit 5b07301

Browse files
reimport: add test for internal duplicates during matching
1 parent ee9ee74 commit 5b07301

File tree

1 file changed

+113
-1
lines changed

1 file changed

+113
-1
lines changed

unittests/test_import_reimport.py

Lines changed: 113 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from django.urls import reverse
1212
from django.utils import timezone
1313

14-
from dojo.models import Finding, Test, Test_Type, User
14+
from dojo.models import Engagement, Finding, Product, Product_Type, Test, Test_Type, User
1515

1616
from .dojo_test_case import DojoAPITestCase, get_unit_tests_scans_path
1717
from .test_utils import assertTestImportModelsCreated
@@ -2033,6 +2033,118 @@ def test_reimport_set_scan_date_parser_sets_date(self):
20332033
date = findings["results"][0]["date"]
20342034
self.assertEqual(date, "2006-12-26")
20352035

2036+
@override_settings(
    IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50,
    IMPORT_REIMPORT_MATCH_BATCH_SIZE=50,
)
def test_reimport_with_internal_duplicates_same_batch(self):
    """
    Reimport a report whose findings duplicate each other into an empty test.

    API-level counterpart of the Selenium integration test
    test_import_path_tests in tests/dedupe_test.py.

    Background (as described by the test author):
        Reimport handles findings in batches and fetches the candidate
        matches for a batch once, before any finding of that batch is saved.
        Findings that duplicate each other inside one batch therefore cannot
        see each other as candidates, and each one ends up being created.

    Scenario:
        1. Build a fresh product type / product / engagement with
           deduplication_on_engagement enabled, plus an empty "Bandit Scan"
           test (nothing pre-existing to match against).
        2. Reimport tests/dedupe_scans/dedupe_path_1.json into that test.
           The file is expected to hold 3 findings sharing file_path
           "/opt/appsecpipeline/source/dojo/cred/views.py", line_number 524
           and test_id "B110", i.e. the same hash_code.
        3. Both batch sizes are overridden to 50 so that all three findings
           are handled within one batch.

    Expected outcome:
        - total findings:     1 (the three entries collapse into one)
        - active findings:    1
        - duplicate findings: 0 (they match each other, none is flagged)

    Outcome without intra-batch tracking (per the test author):
        - total findings:     3
        - active findings:    1
        - duplicate findings: 2

    The test is meant to fail until intra-batch duplicate tracking exists.
    """
    # The report lives with the integration tests (tests/dedupe_scans),
    # not under unittests/scans.
    repo_root = Path(__file__).parent.parent
    report_path = repo_root / "tests" / "dedupe_scans" / "dedupe_path_1.json"

    # Product hierarchy dedicated to this scenario.
    prod_type, _ = Product_Type.objects.get_or_create(name="PT Bandit Internal Dupes")
    prod, _ = Product.objects.get_or_create(name="P Bandit Internal Dupes", prod_type=prod_type)
    eng = Engagement.objects.create(
        name="E Bandit Internal Dupes",
        product=prod,
        target_start=timezone.now(),
        target_end=timezone.now(),
    )
    eng.deduplication_on_engagement = True
    eng.save()

    # Empty test of the matching scan type, so the reimport starts from zero findings.
    bandit_type, _ = Test_Type.objects.get_or_create(name="Bandit Scan")
    empty_test = Test.objects.create(
        engagement=eng,
        test_type=bandit_type,
        title="Path Test 1",
        target_start=timezone.now(),
        target_end=timezone.now(),
    )

    # Reimport the report; its internal duplicates have to match one another
    # while being processed in the same batch.
    reimport_result = self.reimport_scan_with_params(
        empty_test.id,
        report_path,
        scan_type="Bandit Scan",
    )

    # Count against the test id reported back by the reimport call.
    findings_of_test = Finding.objects.filter(test_id=reimport_result["test"])
    total_findings = findings_of_test.count()
    active_findings = findings_of_test.filter(active=True, duplicate=False).count()
    duplicate_findings = findings_of_test.filter(duplicate=True).count()

    # Each metric gets its own subTest so that one failure does not hide the others.
    with self.subTest(metric="total_findings"):
        self.assertEqual(
            total_findings,
            1,
            f"Expected 1 total finding (3 duplicates should match to 1), got {total_findings}. "
            f"If this is 3, intra-batch duplicates weren't detected.",
        )

    with self.subTest(metric="active_findings"):
        self.assertEqual(
            active_findings,
            1,
            f"Expected 1 active finding, got {active_findings}. "
            f"If this fails, intra-batch duplicates weren't detected.",
        )

    with self.subTest(metric="duplicate_findings"):
        self.assertEqual(
            duplicate_findings,
            0,
            f"Expected 0 duplicate findings (duplicates within report should match), got {duplicate_findings}",
        )
2147+
20362148

20372149
class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin):
20382150
fixtures = ["dojo_testdata.json"]

0 commit comments

Comments
 (0)