Merge pull request #120 from scanoss/feature/mdaloia/SP-2655-SCANOSS.PY-Produce-extra-hash-for-windows-line-endings

matiasdaloia · web-flow · commit f36821703edc · 2025-06-10T17:37:17.000+02:00
[SP-2655] Produce extra hash for windows line endings
diff --git a/.github/workflows/python-local-test.yml b/.github/workflows/python-local-test.yml
@@ -39,9 +39,10 @@ jobs:
           retry_wait_seconds: 10
           max_attempts: 3
           retry_on: error
+          shell: bash
           command: |
             pip install -r requirements.txt
-            pip install dist/scanoss-*-py3-none-any.whl
+            pip install dist/scanoss-*.whl
             which scanoss-py
 
       - name: Run Tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Upcoming changes...
 
+## [1.25.0] - 2025-06-10
+### Added
+- Add `fh2` hash while fingerprinting mixed line ending files
+### Modified
+- Updated `inspect` debug/warning statements
+
 ## [1.24.0] - 2025-05-28
 ### Added
 - Add `crypto` subcommand to retrieve cryptographic algorithms for the given components
@@ -522,4 +528,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 [1.21.0]: https://github.com/scanoss/scanoss.py/compare/v1.20.6...v1.21.0
 [1.22.0]: https://github.com/scanoss/scanoss.py/compare/v1.21.0...v1.22.0
 [1.23.0]: https://github.com/scanoss/scanoss.py/compare/v1.22.0...v1.23.0
-[1.24.0]: https://github.com/scanoss/scanoss.py/compare/v1.23.0...v1.24.0
+[1.24.0]: https://github.com/scanoss/scanoss.py/compare/v1.23.0...v1.24.0
+[1.25.0]: https://github.com/scanoss/scanoss.py/compare/v1.24.0...v1.25.0
diff --git a/src/scanoss/__init__.py b/src/scanoss/__init__.py
@@ -22,4 +22,4 @@
   THE SOFTWARE.
 """
 
-__version__ = '1.24.0'
+__version__ = '1.25.0'
diff --git a/src/scanoss/winnowing.py b/src/scanoss/winnowing.py
@@ -32,9 +32,10 @@
 import pathlib
 import platform
 import re
+from typing import Tuple
 
-from crc32c import crc32c
 from binaryornot.check import is_binary
+from crc32c import crc32c
 
 from .scanossbase import ScanossBase
 
@@ -157,7 +158,7 @@ class Winnowing(ScanossBase):
     a list of WFP fingerprints with their corresponding line numbers.
     """
 
-    def __init__(
+    def __init__(  # noqa: PLR0913
         self,
         size_limit: bool = False,
         debug: bool = False,
@@ -197,6 +198,7 @@ def __init__(
         self.strip_hpsm_ids = strip_hpsm_ids
         self.strip_snippet_ids = strip_snippet_ids
         self.hpsm = hpsm
+        self.is_windows = platform.system() == 'Windows'
         if hpsm:
             self.crc8_maxim_dow_table = []
             self.crc8_generate_table()
@@ -218,11 +220,11 @@ def __normalize(byte):
             return byte
         if byte >= ASCII_a:
             return byte
-        if (byte >= 65) and (byte <= 90):
+        if (byte >= ASCII_A) and (byte <= ASCII_Z):
             return byte + 32
         return 0
 
-    def __skip_snippets(self, file: str, src: str) -> bool:
+    def __skip_snippets(self, file: str, src: str) -> bool:  # noqa: PLR0911
         """
         Determine files that are not of interest based on their content or file extension
         Parameters
@@ -351,7 +353,55 @@ def __strip_snippets(self, file: str, wfp: str) -> str:
             self.print_debug(f'Stripped snippet ids from {file}')
         return wfp
 
-    def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
+    def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool]:
+        """Detect the types of line endings present in file contents.
+
+        Args:
+            contents: File contents as bytes.
+
+        Returns:
+            Tuple of (has_crlf, has_standalone_lf, has_standalone_cr) indicating which line ending types are present.
+        """
+        has_crlf = b'\r\n' in contents
+        # For LF detection, we need to find LF that's not part of CRLF
+        content_without_crlf = contents.replace(b'\r\n', b'')
+        has_standalone_lf = b'\n' in content_without_crlf
+        # For CR detection, we need to find CR that's not part of CRLF
+        has_standalone_cr = b'\r' in content_without_crlf
+
+        return has_crlf, has_standalone_lf, has_standalone_cr
+
+    def __calculate_opposite_line_ending_hash(self, contents: bytes):
+        """Calculate hash for contents with opposite line endings.
+
+        If the file contains only Windows (CRLF) line endings, calculates the Unix (LF) hash.
+        Otherwise (LF, CR, or mixed line endings), calculates the Windows (CRLF) hash.
+
+        Args:
+            contents: File contents as bytes.
+
+        Returns:
+            Hash with opposite line endings as hex string, or None if no line endings detected.
+        """
+        has_crlf, has_standalone_lf, has_standalone_cr = self.__detect_line_endings(contents)
+
+        if not has_crlf and not has_standalone_lf and not has_standalone_cr:
+            return None
+
+        # Normalize all line endings to LF first
+        normalized = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
+
+        # Choose the target line ending: only pure-CRLF content is converted to LF
+        if has_crlf and not has_standalone_lf and not has_standalone_cr:
+            # File is Windows (CRLF) - produce Unix (LF) hash
+            opposite_contents = normalized
+        else:
+            # File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash
+            opposite_contents = normalized.replace(b'\n', b'\r\n')
+
+        return hashlib.md5(opposite_contents).hexdigest()
+
+    def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:  # noqa: PLR0912, PLR0915
         """
         Generate a Winnowing fingerprint (WFP) for the given file contents
         Parameters
@@ -371,7 +421,7 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
         content_length = len(contents)
         original_filename = file
 
-        if platform.system() == 'Windows':
+        if self.is_windows:
             original_filename = file.replace('\\', '/')
         wfp_filename = repr(original_filename).strip("'")  # return a utf-8 compatible version of the filename
         if self.obfuscate:  # hide the real size of the file and its name, but keep the suffix
@@ -380,6 +430,13 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
             self.file_map[wfp_filename] = original_filename  # Save the file name map for later (reverse lookup)
 
         wfp = 'file={0},{1},{2}\n'.format(file_md5, content_length, wfp_filename)
+
+        # Add opposite line ending hash based on line ending analysis
+        if not bin_file:
+            opposite_hash = self.__calculate_opposite_line_ending_hash(contents)
+            if opposite_hash is not None:
+                wfp += f'fh2={opposite_hash}\n'
+
         # We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
         if bin_file or self.skip_snippets or self.__skip_snippets(file, contents.decode('utf-8', 'ignore')):
             return wfp
@@ -467,7 +524,7 @@ def calc_hpsm(self, content):
         for i, byte in enumerate(content):
             c = byte
             if c == ASCII_LF:  # When there is a new line
-                if len(list_normalized):
+                if list_normalized:
                     crc_lines.append(self.crc8_buffer(list_normalized))
                     list_normalized = []
                 elif last_line + 1 == i:
diff --git a/tests/test_winnowing.py b/tests/test_winnowing.py