Spoken-tutorial · MetalBlazer · Jul 6, 2018 · Jul 6, 2018 · Jul 6, 2018 · Jul 6, 2018
diff --git a/OnlySpam.csv b/OnlySpam.csv
diff --git a/OnlySpam.py b/OnlySpam.py
@@ -0,0 +1,115 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.svm import LinearSVC
+import pickle
+fields = ['Content', 'Spam']  # CSV columns loaded here and in predictorspam()
+
+df = pd.read_csv('OnlySpam.csv', usecols=fields, skipinitialspace=True)
+# TAG_RE = re.compile(r'<a.*?>.*?</a>')
+
+
+def remove_tags(text):
+    """Strip HTML from text, drop <style> blocks and words under 3 chars."""
+    soup = BeautifulSoup(text, "lxml")
+    # Remove every <style> element; soup.style only reaches the first one.
+    for style_tag in soup.find_all('style'):
+        style_tag.decompose()
+    cleaned = soup.get_text()
+    for junk in ('&nbsp;', '\n', '\r', '\t'):
+        cleaned = cleaned.replace(junk, '')
+    return ' '.join([w for w in cleaned.split() if len(w) >= 3])
+
+
+# Train the baseline spam model on OnlySpam.csv when this module is loaded.
+df['Content'] = df['Content'].apply(remove_tags)
+
+# TF-IDF features with English stop words removed.
+vectorizer = TfidfVectorizer(stop_words='english')
+x_train = vectorizer.fit_transform(df['Content'])
+
+model = LinearSVC()
+model.fit(x_train, df['Spam'])
+filename = 'spam_model.sav'
+with open(filename, 'wb') as model_file:  # close the handle once written
+    pickle.dump(model, model_file)
+from django.conf import settings
+
+def predictorspam(comment, foss_id, tdid):
+    """Classify comment against the tutorial's subtitles and the cuss corpus.
+
+    Every call trains a fresh LinearSVC on TF-IDF features of
+      * the '.'-separated pieces of the text returned by
+        os_walk(foss_id, tdid), all labelled 2, and
+      * the rows of cuss.csv (columns 'Content' and 'Spam'),
+    saves the model to spam_model.sav and returns its label for comment.
+
+    Labels: 0 - Spam, 1 - Training related, 2 - Tutorial related.
+    """
+    # os_walk() can return None when the video directory is missing.
+    clean_data = (os_walk(foss_id, tdid) or "").split(".")
+    # Build both columns in one go; the old bare `except:` that created the
+    # lists on the first KeyError would also have hidden any other error.
+    my_dict = {"Content": clean_data, "Spam": [2] * len(clean_data)}
+
+    new_df = pd.DataFrame(data=my_dict)
+    df = pd.read_csv('cuss.csv', usecols=fields, skipinitialspace=True)
+    result_df = pd.concat([new_df, df])
+
+    result_df['Content'] = result_df['Content'].apply(remove_tags)
+    vectorizer = TfidfVectorizer(stop_words='english')
+    x_train = vectorizer.fit_transform(result_df['Content'])
+    model = LinearSVC()
+    model.fit(x_train, result_df['Spam'])
+
+    # Still written to disk as before; `with` closes the handle that the
+    # bare open() call used to leak.
+    filename = 'spam_model.sav'
+    with open(filename, 'wb') as model_file:
+        pickle.dump(model, model_file)
+
+    # Predict with the in-memory model: reloading the pickle just written
+    # yields the same estimator at the cost of an extra file read.
+    simplified = remove_tags(comment)
+    contest = vectorizer.transform([simplified])
+    return model.predict(contest)[0]
+import re
+import os
+def get_script_data(root,file):
+    """Return the text of root/file lowercased, keeping only letters, spaces and dots."""
+    with open(os.path.join(root, file)) as docfile:
+        data = docfile.read()
+    # Turn line breaks into spaces first; otherwise the filter below deletes
+    # them and glues the last word of a line onto the first word of the next.
+    data = re.sub(r'[\r\n]+', ' ', data)
+    data_parsed = re.sub('[^A-Z a-z .]+', '', data)
+    return data_parsed.lower()
+
+#VIDEO_PATH = '/datas/websites/saurabh-a/spoken-website/media/videos/'
+from config import VIDEO_PATH
+def os_walk(foss_id, tdid):
+    """Collect the English subtitle text of one tutorial.
+
+    Looks under VIDEO_PATH/<foss_id>/ for a folder named str(tdid) and
+    concatenates get_script_data() for every file ending in "English.srt"
+    at or below that folder, in directories that themselves have subdirectories.
+
+    Returns the concatenated text, or "" when nothing is found.
+    """
+    data = ""
+    filepath = VIDEO_PATH +'/' +str(foss_id) + '/'
+    # Only the top level of filepath matters: the original loop returned at
+    # the end of its first pass. next() with a default also covers a missing
+    # directory, where the loop body never ran and None was returned.
+    top = next(os.walk(filepath), None)
+    if top is None or str(tdid) not in top[1]:
+        return data
+    sub_filepath = filepath + str(tdid) + '/'
+    for sub_root, sub_dirs, sub_files in os.walk(sub_filepath):
+        if not sub_dirs:
+            continue  # as before, directories without subdirectories are skipped
+        for name in sub_files:
+            if name.endswith("English.srt"):
+                data += get_script_data(sub_root, name)
+    return data
+
diff --git a/README.md b/README.md
@@ -4,3 +4,10 @@
 1. Create new database and import forums_empty.sql file from 'data' folder.
 2. Create copy the `forums/config.sample.py` to `forums.config.py` and add the values accordingly.
 3. To override any settings, create `forums/local_settings.py` and add the settings there.
+
+# For Spam filter module
+
+1. Install the required dependencies by running `pip install -r mlrequirements.txt`
+2. Whenever the database or the model scripts change, rerun the affected scripts in the hosted environment:
+    `python Spoken.py`
+    `python OnlySpam.py`
diff --git a/STdataset.csv b/STdataset.csv
diff --git a/Spoken.py b/Spoken.py
@@ -0,0 +1,68 @@
+# Libraries
+import pandas as pd
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.svm import LinearSVC
+# from nltk import word_tokenize
+# from nltk.stem import WordNetLemmatizer
+import pickle
+from imblearn.over_sampling import SMOTE
+fields = ['Content', 'Label']  # the only CSV columns read below
+
+# Load the Content/Label columns of STdataset.csv into a dataframe
+df = pd.read_csv('STdataset.csv', skipinitialspace=True, usecols=fields)
+
+# Stripping function
+
+
+def remove_tags(text):
+    """Strip HTML from text, drop <style> blocks and words under 3 chars."""
+    soup = BeautifulSoup(text, "lxml")
+    for style_tag in soup.find_all('style'):  # every <style>, not just the first
+        style_tag.decompose()
+    cleaned = soup.get_text()
+    for junk in ('&nbsp;', '\n', '\r', '\t'):
+        cleaned = cleaned.replace(junk, '')
+    return ' '.join([w for w in cleaned.split() if len(w) >= 3])
+
+
+df['Content'] = df['Content'].apply(remove_tags)
+
+# Lemmatizer (disabled; kept for reference)
+'''
+class LemmaTokenizer(object):
+    def __init__(self):
+        self.wnl = WordNetLemmatizer()
+
+    def __call__(self, doc):
+        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
+'''
+
+# Vectorizer
+vectorizer = TfidfVectorizer(stop_words='english')
+x = vectorizer.fit_transform(df['Content'])
+
+# Minority oversampling
+sm = SMOTE(random_state=42)
+x, y = sm.fit_sample(x, df['Label'])
+
+# Model fitting
+model = LinearSVC(random_state=42, tol=5, fit_intercept=False)
+model.fit(x, y)
+
+# Save the model; `with` closes the handle the bare open() call leaked.
+filename = 'tutorial_model.sav'
+with open(filename, 'wb') as model_file:
+    pickle.dump(model, model_file)
+
+# Predictor function
+
+
+def predictor(comment):
+    """Clean comment, vectorize it and return the label predicted by the pickled model."""
+    simplified = remove_tags(comment)
+    print(simplified)
+    contest = vectorizer.transform([simplified])
+    with open(filename, 'rb') as model_file:  # closed as soon as the model is loaded
+        load_model = pickle.load(model_file)
+    return load_model.predict(contest)[0]