@@ -0,0 +1,119 @@
+# Pairwise uniqueness check of all files in the specified folder
+import os
+from difflib import SequenceMatcher
+from tqdm import tqdm
+import datetime
+import requests
+
+# download the stopwords corpus; this needs to be run only once
+import nltk
+#nltk.download("stopwords")
+from nltk.corpus import stopwords
+import pymorphy2
+from string import punctuation
+
+from thefuzz import fuzz
+
+# ------------------------------- SETTINGS ------------
+# base directory (one level above this script, matching the repository layout for semester 2, 2022-23)
+BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+# directory being checked
+LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")
+
+# ------------------------------- / SETTINGS ------------
+
+
+def log(msg: str = None):
+    # print a message and append it to the global report buffer
+    global out_str
+    if msg is None:
+        print()
+        out_str += "\n"
+    else:
+        print(msg)
+        out_str += f"{msg}\n"
+
+
+
+# Create lemmatizer and stopwords list
+morph = pymorphy2.MorphAnalyzer()
+russian_stopwords = stopwords.words("russian")
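+# (pymorphy2 loads its dictionaries when MorphAnalyzer() is created, so this single
+# instance is reused for every file processed below)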
+
+# Preprocess function: strip punctuation, lowercase, drop stopwords, lemmatize
+def preprocess_text(text):
+    # replace every punctuation character with a space, then split into lowercase words
+    translator = str.maketrans(punctuation, ' ' * len(punctuation))
+    words = text.translate(translator)
+    words = words.lower().split()
+
+    # strip punctuation still glued to a word (e.g. «word,» or «"like this"»)
+    clear_words = []
+    for word in words:
+        clear_word = ""
+        for s in word:
+            if s not in punctuation:
+                clear_word = clear_word + s
+        clear_words.append(clear_word)
+
+    # lemmatize and drop stopwords and leftover punctuation
+    tokens = [morph.parse(token)[0].normal_form for token in clear_words
+              if token not in russian_stopwords
+              and token != " "
+              and token.strip() not in punctuation]
+
+    text = " ".join(tokens)
+    return tokens, text
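+
+# Rough illustration (exact normal forms depend on the installed pymorphy2 dictionaries):
+#   preprocess_text("Защита информации.")  ->  (['защита', 'информация'], 'защита информация')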
+
+
+out_str = ""
+now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
+log(f"Check time: {now}")
+
+
+# collect all .md files in the checked directory (subdirectories are ignored)
+files_paths = []
+files = os.listdir(LECTION_DIR)
+for file in files:
+    file_path = os.path.join(LECTION_DIR, file)
+    filename, fileext = os.path.splitext(file)
+
+    if os.path.isfile(file_path) and fileext == '.md':
+        files_paths.append(file_path)
+
+
+# compare every file against every other one (both orders, so each pair is reported twice)
+for file_1 in files_paths:
+    for file_2 in files_paths:
+        if file_1 != file_2:
+            small_filename_1 = str(file_1).replace(LECTION_DIR, "").strip("\\")
+            small_filename_2 = str(file_2).replace(LECTION_DIR, "").strip("\\")
+
+            # read each file as UTF-8, falling back to cp1251
+            try:
+                with open(file_1, encoding="utf-8") as f_1:
+                    str1 = f_1.read()
+            except UnicodeDecodeError:
+                with open(file_1, encoding="cp1251") as f_1:
+                    str1 = f_1.read()
+
+            try:
+                with open(file_2, encoding="utf-8") as f_2:
+                    str2 = f_2.read()
+            except UnicodeDecodeError:
+                with open(file_2, encoding="cp1251") as f_2:
+                    str2 = f_2.read()
+
+            str1_tokens, str1_uniq_text = preprocess_text(str1)
+            str2_tokens, str2_uniq_text = preprocess_text(str2)
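+
+            # Uniqueness metric: the fraction of the smaller lemma set that does not
+            # occur in the other file, i.e. (1 - |T1 ∩ T2| / min(|T1|, |T2|)) * 100.
+            # For example, 40 shared lemmas with min(|T1|, |T2|) = 200 gives 80% uniqueness.
+            # (An empty file would make the minimum 0 and raise ZeroDivisionError below.)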
+            # intersection of the two sets of unique lemmas
+            min_tokens_len = min([len(set(str1_tokens)), len(set(str2_tokens))])
+            c = list(set(str1_tokens) & set(str2_tokens))
+            ratio = (1 - (len(c) / min_tokens_len)) * 100
+            log(f"uniqueness {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
+            log()
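+
+            # The SequenceMatcher / thefuzz imports at the top are currently unused; a
+            # character-level cross-check on the preprocessed texts could look roughly
+            # like this (sketch only, not written to the report):
+            #   char_ratio = fuzz.ratio(str1_uniq_text, str2_uniq_text)
+            #   seq_ratio = SequenceMatcher(None, str1_uniq_text, str2_uniq_text).ratio() * 100
+            #   log(f"character similarity: {char_ratio}% / {seq_ratio:.2f}%")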
+
+# write the accumulated report next to the checked files
+with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
+    f_log.write(out_str)
+
+