3232import pathlib
3333import platform
3434import re
35+ from typing import Tuple
3536
36- from crc32c import crc32c
3737from binaryornot .check import is_binary
38+ from crc32c import crc32c
3839
3940from .scanossbase import ScanossBase
4041
@@ -157,7 +158,7 @@ class Winnowing(ScanossBase):
157158 a list of WFP fingerprints with their corresponding line numbers.
158159 """
159160
160- def __init__ (
161+ def __init__ ( # noqa: PLR0913
161162 self ,
162163 size_limit : bool = False ,
163164 debug : bool = False ,
@@ -197,6 +198,7 @@ def __init__(
197198 self .strip_hpsm_ids = strip_hpsm_ids
198199 self .strip_snippet_ids = strip_snippet_ids
199200 self .hpsm = hpsm
201+ self .is_windows = platform .system () == 'Windows'
200202 if hpsm :
201203 self .crc8_maxim_dow_table = []
202204 self .crc8_generate_table ()
@@ -218,11 +220,11 @@ def __normalize(byte):
218220 return byte
219221 if byte >= ASCII_a :
220222 return byte
221- if (byte >= 65 ) and (byte <= 90 ):
223+ if (byte >= ASCII_A ) and (byte <= ASCII_Z ):
222224 return byte + 32
223225 return 0
224226
225- def __skip_snippets (self , file : str , src : str ) -> bool :
227+ def __skip_snippets (self , file : str , src : str ) -> bool : # noqa: PLR0911
226228 """
227229 Determine files that are not of interest based on their content or file extension
228230 Parameters
@@ -351,7 +353,55 @@ def __strip_snippets(self, file: str, wfp: str) -> str:
351353 self .print_debug (f'Stripped snippet ids from { file } ' )
352354 return wfp
353355
354- def wfp_for_contents (self , file : str , bin_file : bool , contents : bytes ) -> str :
def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool]:
    """Detect the types of line endings present in file contents.

    Args:
        contents: File contents as bytes.

    Returns:
        Tuple of (has_crlf, has_standalone_lf, has_standalone_cr):
            has_crlf: at least one CRLF pair is present.
            has_standalone_lf: at least one LF that is not part of a CRLF pair.
            has_standalone_cr: at least one CR that is not part of a CRLF pair.
        There is no separate "mixed" flag; more than one True value means the
        contents mix line ending styles.
    """
    has_crlf = b'\r\n' in contents
    # Drop every CRLF pair so that any CR or LF still present is a standalone one
    content_without_crlf = contents.replace(b'\r\n', b'')
    has_standalone_lf = b'\n' in content_without_crlf
    has_standalone_cr = b'\r' in content_without_crlf

    return has_crlf, has_standalone_lf, has_standalone_cr
373+
def __calculate_opposite_line_ending_hash(self, contents: bytes):
    """Return the MD5 of the contents rewritten with the opposite line endings.

    Contents whose line endings are exclusively CRLF are hashed in their LF form.
    Any other contents that have line endings (LF only, CR only, or a mixture)
    are hashed in their CRLF form.

    Args:
        contents: File contents as bytes.

    Returns:
        Hex digest string of the converted contents, or None when the contents
        have no line endings at all.
    """
    crlf_found, bare_lf_found, bare_cr_found = self.__detect_line_endings(contents)

    # Nothing to convert: the contents have no line ending of any kind
    if not (crlf_found or bare_lf_found or bare_cr_found):
        return None

    # Collapse CRLF first, then any leftover CR, so every line ending becomes LF
    lf_form = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    # Only a purely CRLF file flips to LF; everything else flips to CRLF
    purely_crlf = crlf_found and not (bare_lf_found or bare_cr_found)
    flipped = lf_form if purely_crlf else lf_form.replace(b'\n', b'\r\n')

    return hashlib.md5(flipped).hexdigest()
403+
404+ def wfp_for_contents (self , file : str , bin_file : bool , contents : bytes ) -> str : # noqa: PLR0912, PLR0915
355405 """
356406 Generate a Winnowing fingerprint (WFP) for the given file contents
357407 Parameters
@@ -371,7 +421,7 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
371421 content_length = len (contents )
372422 original_filename = file
373423
374- if platform . system () == 'Windows' :
424+ if self . is_windows :
375425 original_filename = file .replace ('\\ ' , '/' )
376426 wfp_filename = repr (original_filename ).strip ("'" ) # return a utf-8 compatible version of the filename
377427 if self .obfuscate : # hide the real size of the file and its name, but keep the suffix
@@ -380,6 +430,13 @@ def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
380430 self .file_map [wfp_filename ] = original_filename # Save the file name map for later (reverse lookup)
381431
382432 wfp = 'file={0},{1},{2}\n ' .format (file_md5 , content_length , wfp_filename )
433+
434+ # Add opposite line ending hash based on line ending analysis
435+ if not bin_file :
436+ opposite_hash = self .__calculate_opposite_line_ending_hash (contents )
437+ if opposite_hash is not None :
438+ wfp += f'fh2={ opposite_hash } \n '
439+
383440 # We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
384441 if bin_file or self .skip_snippets or self .__skip_snippets (file , contents .decode ('utf-8' , 'ignore' )):
385442 return wfp
@@ -467,7 +524,7 @@ def calc_hpsm(self, content):
467524 for i , byte in enumerate (content ):
468525 c = byte
469526 if c == ASCII_LF : # When there is a new line
470- if len ( list_normalized ) :
527+ if list_normalized :
471528 crc_lines .append (self .crc8_buffer (list_normalized ))
472529 list_normalized = []
473530 elif last_line + 1 == i :
0 commit comments