feat(model): Add linearsvc agent

its-sushant · its-sushant · commit 58e27848b652 · 2022-07-21T12:53:18.000+05:30
diff --git a/README.md b/README.md
@@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
 - Running **wordFrequencySimilarity** agent
 
     `atarashi -a wordFrequencySimilarity /path/to/file.c`
+- Running **linearsvc** agent
+
+    `atarashi -a linearsvc /path/to/file.c`
 - Running **tfidf** agent
     - With **Cosine similarity**
 
diff --git a/atarashi/agents/linearsvc.py b/atarashi/agents/linearsvc.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2022 Sushant Kumar (sushantmishra02102002@gmail.com)
+SPDX-License-Identifier: GPL-2.0
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+version 2 as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+__author__ = 'Sushant Kumar'
+__email__ = 'sushantmishra02102002@gmail.com'
+
+import argparse
+
+from atarashi.agents.atarashiAgent import AtarashiAgent
+from atarashi.libs.initialmatch import spdx_identifer
+from linearsvc import linearsvc
+
+
+class Linearsvc(AtarashiAgent):
+
+    def __init__(self, licenseList):
+        super().__init__(licenseList)
+
+    def predict_shortname(self, processed_comment):
+        '''
+        :param filePath: extracted and preprocessed comment
+        :return: Returns the predicted license's short name
+        '''
+
+        processed_comment = [processed_comment]
+        classifier = linearsvc(processed_comment)
+        predictor = classifier.classify()
+        return predictor.predict(processed_comment)
+
+    def scan(self, filePath):
+        '''
+        Read the content of filename, extract the comments and preprocess them.
+        Find the predicted short name for the preprocessed file.
+        :param filePath: Path of the file to scan
+        :return: Returns the license's short name
+        '''
+
+        match = []
+
+        with open(filePath) as file:
+            raw_data = file.read()
+
+        spdx_identifers = spdx_identifer(raw_data,
+                                         self.licenseList['shortname'])
+        if spdx_identifers:
+            match.extend(spdx_identifers)
+        else:
+            processed_comment = super().loadFile(filePath)
+            license_name = self.predict_shortname(processed_comment)
+
+            match.append({
+                'shortname': str(license_name[0]),
+                'sim_score': 1.0,
+                'sim_type': 'linearsvc',
+                'description': '',
+            })
+        return match
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('processedLicenseList',
+                        help='Specify the processed license list file')
+    parser.add_argument('inputFile',
+                        help='Specify the input file which needs to be scanned'
+                        )
+
+    args = parser.parse_args()
+
+    licenseList = args.processedLicenseList
+    filename = args.inputFile
+
+    scanner = Linearsvc(licenseList)
+    scanner.scan(filename)
diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py
@@ -28,6 +28,7 @@
 from atarashi.agents.dameruLevenDist import DameruLevenDist
 from atarashi.agents.tfidf import TFIDF
 from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
+from atarashi.agents.linearsvc import Linearsvc
 
 __author__ = "Aman Jain"
 __email__ = "amanjain5221@gmail.com"
@@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
     scanner = WordFrequencySimilarity(processedLicense)
   elif agent_name == "DLD":
     scanner = DameruLevenDist(processedLicense)
+  elif agent_name == "linearsvc":
+    scanner = Linearsvc(processedLicense)
   elif agent_name == "tfidf":
     scanner = TFIDF(processedLicense)
     if similarity == "CosineSim":
@@ -128,7 +131,7 @@ def main():
   parser.add_argument("-l", "--processedLicenseList", required=False,
                       help="Specify the location of processed license list file")
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py
@@ -118,7 +118,7 @@ def evaluate(scanner):
   defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
   parser = argparse.ArgumentParser()
   parser.add_argument("-a", "--agent_name", required=True,
-                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
+                      choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
                       help="Name of the agent that needs to be run")
   parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
                       choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,10 +5,11 @@ requires = [
   "numpy>=1.16.0",
   "tqdm>=4.42.0",
   "pandas>=0.23.1",
-  "scikit-learn>=0.18.1",
+  "scikit-learn==1.1.1",
   "scipy>=0.18.1",
   "textdistance>=3.0.3",
   "pyxDamerauLevenshtein>=1.5",
   "nirjas>=0.0.5",
-  "urllib3>=1.24.1"
+  "urllib3>=1.24.1",
+  "linearsvc>=0.1.1"
 ]
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,11 @@
 tqdm>=4.42.0
 pandas>=0.23.1
 pyxDamerauLevenshtein>=1.5
-scikit-learn>=0.18.1
+scikit-learn==1.1.1
 scipy>=0.18.1
 spacy>=2.0.11
 textdistance>=3.0.3
 setuptools>=39.2.0
 nirjas>=0.0.5
 urllib3>=1.24.1
+linearsvc>=0.1.1
diff --git a/setup.py b/setup.py
@@ -68,7 +68,8 @@ def read(fname):
   'textdistance>=3.0.3',
   'pyxDamerauLevenshtein>=1.5',
   'urllib3>=1.24.1',
-  'nirjas>=0.0.5'
+  'nirjas>=0.0.5',
+  'linearsvc>=0.1.1'
 ]
 
 class BuildAtarashiDependencies(distutils.cmd.Command):

Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,8 @@ def read(fname):`
`68`	`68`	`'textdistance>=3.0.3',`
`69`	`69`	`'pyxDamerauLevenshtein>=1.5',`
`70`	`70`	`'urllib3>=1.24.1',`
`71`		`- 'nirjas>=0.0.5'`
	`71`	`+ 'nirjas>=0.0.5',`
	`72`	`+ 'linearsvc>=0.1.1'`
`72`	`73`	`]`
`73`	`74`
`74`	`75`	`class BuildAtarashiDependencies(distutils.cmd.Command):`