intervals-mining-lab · goruha · Sep 22, 2024 · Sep 24, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/src/foapy/alphabet.py b/src/foapy/alphabet.py
@@ -43,20 +43,51 @@ def alphabet(X) -> np.ndarray:
     >>> result
     Exception
     """
+    # ex.:
+    # data = ['a', 'c', 'c', 'e', 'd', 'a']
     data = np.asanyarray(X)
     if data.ndim > 1:  # Checking for d1 array
         raise Not1DArrayException(
             {"message": f"Incorrect array form. Expected d1 array, exists {data.ndim}"}
         )
-
+    # Sort data positions
-    # Sort data positions
+    # Indices that would sort data array
-    # Sort data positions
+    # Indices that would sort data array
+    # ex.:
+    #         a  a  c  c  d  e
+    # perm = [0, 5, 1, 2, 4, 3]
     perm = data.argsort(kind="mergesort")
 
-    mask_shape = data.shape
-    unique_mask = np.empty(mask_shape, dtype=bool)
+    # Create tmp mask array to store True on positions where a new value appears
+    # ex.:
+    #              a  a  c  c  d  e
+    # perm      = [0, 5, 1, 2, 4, 3]
+    # perm[1:]  = [   5, 1, 2, 4, 3]
+    # perm[:-1] = [0, 5, 1, 2, 4   ]
+
+    # data[perm[1:]]                    = [        'a',  'c',  'c',  'd',  'e']
+    # data[perm[:-1]]                   = [        'a',  'a',   'c',  'c',  'd']
+    # data[perm[1:]] != data[perm[:-1]] = [      False, True, False, True, True]
+    # unique_mask                       = [True, False, True, False, True, True]
+    #                                        a     a     c      c      d     e
+    unique_mask = np.empty(data.shape, dtype=bool)
+    # First element is new
     unique_mask[:1] = True
+    # Set true on positions where value differs from previous
     unique_mask[1:] = data[perm[1:]] != data[perm[:-1]]
 
+    # Create tmp array that will store reverse sorted mask array
-    # Create tmp array that will store reverse sorted mask array
+    # Create mask array to store True on positions of the data array where new value appears for the first time
-    # Create tmp array that will store reverse sorted mask array
+    # Create mask array to store True on positions of the data array where new value appears for the first time
+    # ex.:
+    #                        a     a     c      c      d     e
-    #                        a     a     c      c      d     e
+    # sorted data array      a     a     c      c      d     e
-    #                        a     a     c      c      d     e
+    # sorted data array      a     a     c      c      d     e
+    # unique_mask       = [True, False, True, False, True, True]
+    # perm              = [   0,      5,   1,     2,    4,    3]
+    # perm[unique_mask] = [   0,           1,           4,    3]
+    # result_mask       = [True, True, False, True, True, False]
+    #                        a     c     c      e      d     a
-    #                        a     c     c      e      d     a
+    # original data array    a     c     c      e      d     a
-    #                        a     c     c      e      d     a
+    # original data array    a     c     c      e      d     a
     result_mask = np.full_like(unique_mask, False)
-    result_mask[:1] = True
     result_mask[perm[unique_mask]] = True
+
+    # Return elements that are first appears of unique values
-    # Return elements that are first appears of unique values
+    # Return array of first occurrences of elements in the data array
-    # Return elements that are first appears of unique values
+    # Return array of first occurrences of elements in the data array
+    # ex.:
+    # data              = [ 'a',  'c',   'c',  'e',  'd',  'a' ]
+    # result_mask       = [True, True, False, True, True, False]
+    # data[result_mask] = [ 'a',  'c',         'e',  'd'       ]
     return data[result_mask]
diff --git a/src/foapy/intervals.py b/src/foapy/intervals.py
@@ -61,52 +61,250 @@ def intervals(X, bind, mod):
         raise ValueError(
             {"message": "Invalid mode value. Use mode.lossy,normal,cycle or redundant."}
         )
-
+    # ex.:
+    # ar = ['a', 'c', 'c', 'e', 'd', 'a']
     ar = np.asanyarray(X)
 
     if ar.shape == (0,):
         return []
 
     if bind == binding.end:
+        # For binding to the end, we need to reverse the array
+        # ar = ['a', 'd', 'e', 'c', 'c', 'a']
         ar = ar[::-1]
 
+    # Sort data positions
-    # Sort data positions
+    # Get original indices of sorted data array
-    # Sort data positions
+    # Get original indices of sorted data array
+    # ex.:
+    #         a  a  c  c  d  e
+    # perm = [0, 5, 1, 2, 4, 3]
     perm = ar.argsort(kind="mergesort")
 
+    # Create tmp mask array to store True on positions where appears new value.
-    # Create tmp mask array to store True on positions where appears new value.
+    # Create mask array to store True on positions where new value appears for the first time in the sorted array to distinguish where subarray of one element ends and another begins
-    # Create tmp mask array to store True on positions where appears new value.
+    # Create mask array to store True on positions where new value appears for the first time in the sorted array to distinguish where subarray of one element ends and another begins
+    # Create shape length +1 of source,
+    # because we want to use the array for all binding modes.
-    # because we want to use the array for all binding modes.
+    # to use it as both first occurrence marker and last occurrence marker depending on the shift of the data array
-    # because we want to use the array for all binding modes.
+    # to use it as both first occurrence marker and last occurrence marker depending on the shift of the data array
+    # ex.:
+    # Create tmp mask array to store True on positions where appears new value
+    # ex.:
+    #              a  a  c  c  d  e
-    #              a  a  c  c  d  e
+    # sorted data array a  a  c  c  d  e
-    #              a  a  c  c  d  e
+    # sorted data array a  a  c  c  d  e
+    # perm      = [0, 5, 1, 2, 4, 3]
+    # perm[1:]  = [   5, 1, 2, 4, 3]
+    # perm[:-1] = [0, 5, 1, 2, 4   ]
+
+    # data[perm[1:]]                    = [        'a',  'c',   'c',  'd',  'e'      ]
+    # data[perm[:-1]]                   = [        'a',  'a',   'c',  'c',  'd'      ]
+    # data[perm[1:]] != data[perm[:-1]] = [      False, True, False, True, True      ]
+    # unique_mask                       = [True, False, True, False, True, True, True]
+    # First appears                          a     a     c      c      d     e
-    # First appears                          a     a     c      c      d     e
+    # First occurrence                       a     a     c      c      d     e
-    # First appears                          a     a     c      c      d     e
+    # First occurrence                       a     a     c      c      d     e
+    # Last appears                                 a     a      c      c     d     e
-    # Last appears                                 a     a      c      c     d     e
+    # Last occurrence                              a     a      c      c     d     e
-    # Last appears                                 a     a      c      c     d     e
+    # Last occurrence                              a     a      c      c     d     e
+
     mask_shape = ar.shape
     mask = np.empty(mask_shape[0] + 1, dtype=bool)
     mask[:1] = True
     mask[1:-1] = ar[perm[1:]] != ar[perm[:-1]]
     mask[-1:] = True  # or  mask[-1] = True
 
+    # Save masks first and last appears of elements
-    # Save masks first and last appears of elements
+    # Create masks of first and last occurrences of elements by excluding first and last elements from unique_mask accordingly
-    # Save masks first and last appears of elements
+    # Create masks of first and last occurrences of elements by excluding first and last elements from unique_mask accordingly
+    # ex.:
+    #
+    # unique_mask = [True, False, True, False, True, True, True]
+    # first_mask  = [True, False, True, False, True, True      ]
+    #                  a     a     c      c      d     e
+    # last_mask   = [      False, True, False, True, True, True]
+    #                        a     a     c      c      d     e
     first_mask = mask[:-1]
     last_mask = mask[1:]
 
+    # Create tmp array to count intervals
     intervals = np.empty(ar.shape, dtype=np.intp)
+
+    # Count intervals between elements.
+    # Intervals of first elements appears would be wrong on that stage.
-    # Intervals of first elements appears would be wrong on that stage.
+    # Intervals of the first occurrence of all elements would be wrong on that stage.
-    # Intervals of first elements appears would be wrong on that stage.
+    # Intervals of the first occurrence of all elements would be wrong on that stage.
+    # We will fix that later.
+    # ex.:
+    #                         a  a   c  c  d   e
+    # perm                 = [0, 5,  1, 2, 4,  3]
+    # perm[1:]             = [   5,  1, 2, 4,  3]
+    # perm[:-1]            = [   0,  5, 1, 2,  4]
+    # perm[1:] - perm[:-1] = [   5, -4, 1, 2, -1]
+    # intervals            = [0, 5, -4, 1, 2, -1]
+    #                         ^      ^     ^   ^ - wrong intervals
     intervals[1:] = perm[1:] - perm[:-1]
 
+    # Fix first and last intervals
+    # For any mode except cycle delta would be 1
+    # For cycle mode delta would be an array
+
+    # ex.:
+    # len(ar)                   = 6
+    #                                  a     a      c     c     d     e
+    # perm                      = [    0,    5,     1,    2,    4,    3]
+    # last_mask                 = [False, True, False, True, True, True]
+    # perm[last_mask]           = [          5,           2,    4,    3]
+    # len(ar) - perm[last_mask] = [          1,           4,    2,    3]
+    # delta                     = [          1,           4,    2,    3]
+    #                                        a            c     d     e
     delta = len(ar) - perm[last_mask] if mod == mode.cycle else 1
+
+    # ex.:
+    #                                  a     a      c     c     d     e
+    # perm                      = [    0,    5,     1,    2,    4,    3]
+    # first_mask                = [True, False, True, False, True, True]
+    # perm[first_mask]          = [    0,           1,           4,    3]
+    #                                  a            c            d     e
+    # For all modes except cycle
+    #                                 a      a     c      c     d     e
+    # intervals                 = [   0,     5,   -4,     1,    2,   -1]
+    # perm[first_mask] + delta  = [   1,           2,           5,    4]
+    # first_mask                = [True, False, True, False, True, True]
+    # intervals                 = [   1,     5,    2,     1,    5,    4]
+    #                                 a      a     c      c     d     e
+
+    # For cycle mode
+    #                                 a      a     c      c     d     e
+    # intervals                 = [   0,      5,  -4,     1,    2,   -1]
+    # first_mask                = [True, False, True, False, True, True]
+    # perm[first_mask]          = [   0,           1,           4,    3]
+    # delta                     = [   1,           4,           2,    3]
+    # perm[first_mask] + delta  = [   1,           5,           6,    6]
+    # intervals                 = [   1,     5,    5,     1,    6,    6]
+    #                                 a      a     c      c     d     e
     intervals[first_mask] = perm[first_mask] + delta
 
+    # Create inverse permutation array
     inverse_perm = np.empty(ar.shape, dtype=np.intp)
+    # ex.:
+    #                           a  a  c  c  d  e
+    # perm                   = [0, 5, 1, 2, 4, 3]
+    # np.arange(ar.shape[0]) = [0, 1, 2, 3, 4, 5]
+    # inverse_perm           = [0, 2, 3, 5, 4, 1]
+    #                           a  c  c  e  d  a
     inverse_perm[perm] = np.arange(ar.shape[0])
 
+    # Create result array depending on mode
     if mod == mode.lossy:
+        # For lossy mode we ignore intervals for a first appearance of the element
+        # ex.:
+        #                                 a      a     c      c     d     e
+        # intervals                 = [   1,     5,    2,     1,    5,    4]
+        # first_mask                = [True, False, True, False, True, True]
+        # intervals                 = [   0,     5,    0,     1,    0,    0]
+        #                                 a      a     c      c     d     e
         intervals[first_mask] = 0
+
+        # Permute intervals array to the original order
-        # Permute intervals array to the original order
+        # Permute intervals array to the original arrangement
-        # Permute intervals array to the original order
+        # Permute intervals array to the original arrangement
+        # ex.:
+        #                              a  a  c  c  d  e
+        # intervals                 = [0, 5, 0, 1, 0, 0]
+        # inverse_perm              = [0, 2, 3, 5, 4, 1]
+        # intervals                 = [0, 0, 1, 0, 0, 5]
+        #                              a  c  c  e  d  a
         intervals = intervals[inverse_perm]
+
+        # Remove zeros from the array
+        # ex.:
+        #                              a  c  c  e  d  a
+        # intervals                 = [0, 0, 1, 0, 0, 5]
+        # intervals[intervals != 0] = [      1,       5]
+        # result                    = [      1,       5]
+        #                                    c        a
         result = intervals[intervals != 0]
     elif mod == mode.normal:
+        # For normal mode we permute intervals array to the original order
-        # For normal mode we permute intervals array to the original order
+        # For normal mode we permute intervals array to the original arrangement
-        # For normal mode we permute intervals array to the original order
+        # For normal mode we permute intervals array to the original arrangement
+        # ex.:
+        #                            a  a  c  c  d  e
+        # intervals               = [1, 5, 2, 1, 5, 4]
+        # inverse_perm            = [0, 2, 3, 5, 4, 1]
+        # intervals[inverse_perm] = [1, 2, 1, 4, 5, 5]
+        #                            a  c  c  e  d  a
+        # result                  = [1, 2, 1, 4, 5, 5]
         result = intervals[inverse_perm]
     elif mod == mode.cycle:
+        # For cycle mode we permute intervals array to the original order
-        # For cycle mode we permute intervals array to the original order
+        # For cycle mode we permute intervals array to the original arrangement
-        # For cycle mode we permute intervals array to the original order
+        # For cycle mode we permute intervals array to the original arrangement
+        # ex.:
+        #                            a  a  c  c  d  e
+        # intervals               = [1, 5, 5, 1, 6, 6]
+        # inverse_perm            = [0, 2, 3, 5, 4, 1]
+        # intervals[inverse_perm] = [1, 5, 1, 6, 6, 5]
+        #                            a  c  c  e  d  a
+        # result                  = [1, 5, 1, 6, 6, 5]
         result = intervals[inverse_perm]
     elif mod == mode.redundant:
+        # For redundant mode we need to count intervals for the first and last
+        # appearance of an element
+
+        # ex.:
+        #                            a  a  c  c  d  e
+        # intervals               = [1, 5, 2, 1, 5, 4]
+        # inverse_perm            = [0, 2, 3, 5, 4, 1]
+        # intervals[inverse_perm] = [1, 2, 1, 4, 5, 5]
+        #                            a  c  c  e  d  a
+        # result                  = [1, 2, 1, 4, 5, 5]
+
+        # Create 2-dimensional array of shape (len(ar), 2); the "rows" below are its columns
+        # Zero row is for intervals the first appearance of the element and intervals
-        # Zero row is for intervals the first appearance of the element and intervals
+        # Zero row is for the intervals of the first appearance of the element and intervals
-        # Zero row is for intervals the first appearance of the element and intervals
+        # Zero row is for the intervals of the first appearance of the element and intervals
+        # for intermediate appearances
+        # First row will store intervals for the last appearance of the element
-        # First row will store intervals for the last appearance of the element
+        # First row will store only intervals for the last appearance of the elements
-        # First row will store intervals for the last appearance of the element
+        # First row will store only intervals for the last appearance of the elements
         result = np.zeros(shape=ar.shape + (2,), dtype=int)
+
+        # ex.:
+        #                a  a  c  c  d  e
+        # intervals =   [1, 5, 2, 1, 5, 4]
+        # result    = [
+        #               [1, 5, 2, 1, 5, 4]
+        #               [0, 0, 0, 0, 0, 0]
+        #             ]
         result[:, 0] = intervals
+
+        # Set intervals for the last appearance of the element to the first row
+
+        # ex.:
+        #                                  a     a      c     c     d     e
+        # perm                      = [    0,    5,     1,    2,    4,    3]
+        # last_mask                 = [False, True, False, True, True, True]
+        # perm[last_mask]           = [          5,           2,    4,    3]
+        # len(ar) - perm[last_mask] = [          1,           4,    2,    3]
+        # result                    = [
+        #                               [   1,    5,    2,    1,    5,    4]
+        #                               [   0,    1,    0,    4,    2,    3]
+        #                             ]
         result[last_mask, 1] = len(ar) - perm[last_mask]
+
+        # Permute intervals array to the original order
-        # Permute intervals array to the original order
+        # Permute intervals array to the original arrangement
-        # Permute intervals array to the original order
+        # Permute intervals array to the original arrangement
+        # ex.:
+        #                           a  a  c  c  d  e
+        # result               = [
+        #                          [1, 5, 2, 1, 5, 4]
+        #                          [0, 1, 0, 4, 2, 3]
+        #                        ]
+        # inverse_perm         =   [0, 2, 3, 5, 4, 1]
+        # result[inverse_perm] = [
+        #                          [1, 2, 1, 4, 5, 5]
+        #                          [0, 0, 4, 3, 2, 1]
+        #                        ]
+        #                           a  c  c  e  d  a
         result = result[inverse_perm]
+
+        # Flatten result array
+        # ex.:
+        #                           a  c  c  e  d  a
+        # result[inverse_perm] = [
+        #                          [1, 2, 1, 4, 5, 5]
+        #                          [0, 0, 4, 3, 2, 1]
+        #                        ]
+        # result.ravel()       = [ 1, 0, 2, 0, 1, 4, 4, 3, 5, 2, 5, 1]
+        #                        |  a  |  c  |  c  |  e  |  d  |  a  |
         result = result.ravel()
+
+        # Exclude zeros from the result
+        # result               = [ 1, 0, 2, 0, 1, 4, 4, 3, 5, 2, 5, 1]
+        #                        |  a  |  c  |  c  |  e  |  d  |  a  |
+
+        # result[result != 0] = [ 1, 2, 1, 4, 4, 3, 5, 2, 5, 1]
+        #                       |a |c |  c  |  e  |  d  |  a  |
         result = result[result != 0]
 
     if bind == binding.end:
+        # For binding to the end, we need to reverse the result
-        # For binding to the end, we need to reverse the result
+        # For binding to the end, we need to reverse the result back
-        # For binding to the end, we need to reverse the result
+        # For binding to the end, we need to reverse the result back
         result = result[::-1]
 
     return result