Skip to content

Commit 58e2784

Browse files
committed
feat(model): Add linearsvc agent
1 parent 5071444 commit 58e2784

File tree

7 files changed

+105
-6
lines changed

7 files changed

+105
-6
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help`
6060
- Running **wordFrequencySimilarity** agent
6161

6262
`atarashi -a wordFrequencySimilarity /path/to/file.c`
63+
- Running **linearsvc** agent
64+
65+
`atarashi -a linearsvc /path/to/file.c`
6366
- Running **tfidf** agent
6467
- With **Cosine similarity**
6568

atarashi/agents/linearsvc.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Copyright 2022 Sushant Kumar ([email protected])
6+
SPDX-License-Identifier: GPL-2.0
7+
This program is free software; you can redistribute it and/or
8+
modify it under the terms of the GNU General Public License
9+
version 2 as published by the Free Software Foundation.
10+
This program is distributed in the hope that it will be useful,
11+
but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
GNU General Public License for more details.
14+
You should have received a copy of the GNU General Public License along
15+
with this program; if not, write to the Free Software Foundation, Inc.,
16+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17+
"""
18+
19+
__author__ = 'Sushant Kumar'
20+
__email__ = '[email protected]'
21+
22+
import argparse
23+
24+
from atarashi.agents.atarashiAgent import AtarashiAgent
25+
from atarashi.libs.initialmatch import spdx_identifer
26+
from linearsvc import linearsvc
27+
28+
29+
class Linearsvc(AtarashiAgent):
    '''
    Atarashi agent that reports a license short name for a file, using
    SPDX identifiers found in the raw text when available and the
    ``linearsvc`` classifier otherwise.
    '''

    def __init__(self, licenseList):
        # No extra state: initialisation is delegated to AtarashiAgent.
        super().__init__(licenseList)

    def predict_shortname(self, processed_comment):
        '''
        Run the linearsvc classifier on a single preprocessed comment.

        :param processed_comment: extracted and preprocessed comment
        :return: Result of the predictor's ``predict`` call on the
                 single-element sample list; ``scan`` uses its first
                 element as the license short name
        '''
        # The linearsvc API is fed a list of samples, so wrap the comment.
        samples = [processed_comment]
        model = linearsvc(samples).classify()
        return model.predict(samples)

    def scan(self, filePath):
        '''
        Read the content of the file and look for SPDX identifiers in it.
        If none are found, extract and preprocess the comments and ask the
        classifier for a license short name.

        :param filePath: Path of the file to scan
        :return: List of matches; either the SPDX identifier matches or a
                 single dictionary describing the classifier's prediction
        '''
        with open(filePath) as source:
            contents = source.read()

        identifiers = spdx_identifer(contents, self.licenseList['shortname'])
        if identifiers:
            # SPDX identifiers take precedence over the classifier.
            return list(identifiers)

        comment = super().loadFile(filePath)
        prediction = self.predict_shortname(comment)
        return [{
            'shortname': str(prediction[0]),
            'sim_score': 1.0,
            'sim_type': 'linearsvc',
            'description': '',
        }]
73+
74+
75+
if __name__ == '__main__':
    # Command-line entry point: scan one file with the linearsvc agent.
    parser = argparse.ArgumentParser()
    parser.add_argument('processedLicenseList',
                        help='Specify the processed license list file')
    parser.add_argument('inputFile',
                        help='Specify the input file which needs to be scanned')

    args = parser.parse_args()

    licenseList = args.processedLicenseList
    filename = args.inputFile

    scanner = Linearsvc(licenseList)
    # scan() returns the list of matches; print it so that running this
    # module as a script actually reports the result.
    print(scanner.scan(filename))

atarashi/atarashii.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from atarashi.agents.dameruLevenDist import DameruLevenDist
2929
from atarashi.agents.tfidf import TFIDF
3030
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
31+
from atarashi.agents.linearsvc import Linearsvc
3132

3233
__author__ = "Aman Jain"
3334
__email__ = "[email protected]"
@@ -78,6 +79,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
7879
scanner = WordFrequencySimilarity(processedLicense)
7980
elif agent_name == "DLD":
8081
scanner = DameruLevenDist(processedLicense)
82+
elif agent_name == "linearsvc":
83+
scanner = Linearsvc(processedLicense)
8184
elif agent_name == "tfidf":
8285
scanner = TFIDF(processedLicense)
8386
if similarity == "CosineSim":
@@ -128,7 +131,7 @@ def main():
128131
parser.add_argument("-l", "--processedLicenseList", required=False,
129132
help="Specify the location of processed license list file")
130133
parser.add_argument("-a", "--agent_name", required=True,
131-
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
134+
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
132135
help="Name of the agent that needs to be run")
133136
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
134137
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],

atarashi/evaluator/evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def evaluate(scanner):
118118
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
119119
parser = argparse.ArgumentParser()
120120
parser.add_argument("-a", "--agent_name", required=True,
121-
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
121+
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'linearsvc'],
122122
help="Name of the agent that needs to be run")
123123
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
124124
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ requires = [
55
"numpy>=1.16.0",
66
"tqdm>=4.42.0",
77
"pandas>=0.23.1",
8-
"scikit-learn>=0.18.1",
8+
"scikit-learn==1.1.1",
99
"scipy>=0.18.1",
1010
"textdistance>=3.0.3",
1111
"pyxDamerauLevenshtein>=1.5",
1212
"nirjas>=0.0.5",
13-
"urllib3>=1.24.1"
13+
"urllib3>=1.24.1",
14+
"linearsvc>=0.1.1"
1415
]

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
tqdm>=4.42.0
22
pandas>=0.23.1
33
pyxDamerauLevenshtein>=1.5
4-
scikit-learn>=0.18.1
4+
scikit-learn==1.1.1
55
scipy>=0.18.1
66
spacy>=2.0.11
77
textdistance>=3.0.3
88
setuptools>=39.2.0
99
nirjas>=0.0.5
1010
urllib3>=1.24.1
11+
linearsvc>=0.1.1

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ def read(fname):
6868
'textdistance>=3.0.3',
6969
'pyxDamerauLevenshtein>=1.5',
7070
'urllib3>=1.24.1',
71-
'nirjas>=0.0.5'
71+
'nirjas>=0.0.5',
72+
'linearsvc>=0.1.1'
7273
]
7374

7475
class BuildAtarashiDependencies(distutils.cmd.Command):

0 commit comments

Comments
 (0)