plagiat_1_full.v2

ypv committed 2 years ago
parent commit b0dd340bb0
2 changed files with 143 additions and 3 deletions
  1. plagiat_1.v2.py (+3 -3)
  2. plagiat_1_full.v2.py (+140 -0)

+ 3 - 3
plagiat_1.v2.py

@@ -18,11 +18,11 @@ from thefuzz import fuzz
 BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
 # directory to be checked
 # LECTION_DIR = os.path.join("ISRPO", "Лекции")
-LECTION_DIR = os.path.join("EASvZI", "Лекции")
-# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
+# LECTION_DIR = os.path.join("EASvZI", "Лекции")
+LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
 
 # URL to check
-url = "http://213.155.192.79:3001/u20-24osipenko/EASvZI/raw/14b6dd3163277c86ece71dd677d8c571b0623e9e/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.7.100_%d0%9e%d0%b1%d1%89%d0%b8%d0%b5_%d1%82%d1%80%d0%b5%d0%b1%d0%be%d0%b2%d0%b0%d0%bd%d0%b8%d1%8f_%d0%bf%d0%be_%d0%b7%d0%b0%d1%89%d0%b8%d1%82%d0%b5_%d0%bf%d0%b5%d1%80%d1%81%d0%be%d0%bd%d0%b0%d0%bb%d1%8c%d0%bd%d1%8b%d1%85_%d0%b4%d0%b0%d0%bd%d0%bd%d1%8b%d1%85/1.7.100_%d0%9e%d0%b1%d1%89%d0%b8%d0%b5_%d1%82%d1%80%d0%b5%d0%b1%d0%be%d0%b2%d0%b0%d0%bd%d0%b8%d1%8f_%d0%bf%d0%be_%d0%b7%d0%b0%d1%89%d0%b8%d1%82%d0%b5_%d0%bf%d0%b5%d1%80%d1%81%d0%be%d0%bd%d0%b0%d0%bb%d1%8c%d0%bd%d1%8b%d1%85_%d0%b4%d0%b0%d0%bd%d0%bd%d1%8b%d1%85..md"
+url = "http://213.155.192.79:3001/u20-24goncharov/EASvZI/raw/9bfc4f68f89672173113ce27c707265bac180874/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.2.300_%d0%a1%d1%82%d0%b0%d0%b4%d0%b8%d0%b8_%d0%b6%d0%b8%d0%b7%d0%bd%d0%b5%d0%bd%d0%bd%d0%be%d0%b3%d0%be_%d1%86%d0%b8%d0%ba%d0%bb%d0%b0_%d0%90%d0%98%d0%a1/Goncharov.md"
 
 # ------------------------------- / SETTINGS ------------
 

+ 140 - 0
plagiat_1_full.v2.py

@@ -0,0 +1,140 @@
+# Check one submission against all those listed in the readme (for differentiated credit tests)
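+#
+# Summary of the flow below: download the markdown file at `url`, lemmatize it,
+# then compare its set of unique words against every lecture linked from
+# README.md in LECTION_DIR, reporting the closest match.
+# Assumed invocation (not stated in the commit): python plagiat_1_full.v2.py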
+
+import os
+import datetime
+import requests
+from tqdm import tqdm
+
+# download the stopwords corpus; this only needs to run once
+import nltk
+# nltk.download("stopwords")
+from nltk.corpus import stopwords
+import pymorphy2
+from string import punctuation
+
+# ------------------------------- SETTINGS ------------
+# base directory (one level up, for the repository layout of sem. 2, 2022-23)
+BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+# directory to be checked
+# LECTION_DIR = os.path.join("ISRPO", "Лекции")
+# LECTION_DIR = os.path.join("EASvZI", "Лекции")
+LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
+
+# URL to check
+url = "http://213.155.192.79:3001/Chubarov/TZI/raw/299653167a99931f26f9db2256ec10cf37973dc9/2022-23/%d0%94%d0%b8%d1%84.%d0%b7%d0%b0%d1%87%d0%b5%d1%82_2%d1%81%d0%b5%d0%bc/%d0%92%d0%be%d0%bf%d1%80%d0%be%d1%81_3_%d1%80%d0%b0%d0%b7%d0%b4%d0%b5%d0%bb_2.md"
+
+# ------------------------------- / SETTINGS ------------
+
+
+
+# Create the lemmatizer and the stopword list
+morph = pymorphy2.MorphAnalyzer()
+russian_stopwords = stopwords.words("russian")
+
+# Preprocess function: lowercase, strip punctuation, drop stopwords, lemmatize
+def preprocess_text(text):
+    # replace every punctuation character with a space, then split into words
+    translator = str.maketrans(punctuation, ' ' * len(punctuation))
+    words = text.translate(translator)
+    words = words.lower().split()
+
+    # strip punctuation still clinging to a word (word, "or like this")
+    clear_words = []
+    for word in words:
+        clear_word = ""
+        for s in word:
+            if s not in punctuation:
+                clear_word = clear_word + s
+        clear_words.append(clear_word)
+
+    # lemmatize with pymorphy2, dropping stopwords and leftover punctuation
+    tokens = [morph.parse(token)[0].normal_form for token in clear_words
+              if token not in russian_stopwords
+              and token != " "
+              and token.strip() not in punctuation]
+
+    text = " ".join(tokens)
+    return tokens, text
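+
+# Illustration (assumed output; exact lemmas depend on the pymorphy2
+# dictionaries): preprocess_text("Общие требования по защите!") returns
+# roughly (['общий', 'требование', 'защита'], 'общий требование защита');
+# "по" is dropped as an NLTK Russian stopword.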
+
+
+print()
+
+now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
+out_str = f"Check time: {now}\n"
+# print(out_str)
+
+# download the checked document (raw markdown) and preprocess it
+response = requests.get(url)
+post_html = response.text
+
+post_tokens, post_uniq_text = preprocess_text(post_html)
+print(f"number of unique words: {len(set(post_tokens))}")
+print()
+
+
+# look for the other lectures listed in the course README
+readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
+try:
+    with open(readme_path, encoding="utf-8") as f:
+        readme_html = f.read()
+except UnicodeDecodeError:  # fall back to the legacy cp1251 encoding
+    with open(readme_path, encoding="cp1251") as f:
+        readme_html = f.read()
+
+# track the closest match (lower ratio = more shared vocabulary)
+min_ratio = 100
+min_ratio_file = ""
+
+readme_list = readme_html.split("\n")
+for readme_str in tqdm(readme_list):
+    if '[' in readme_str:
+        # parse a markdown link of the form [name](uri)
+        variant_name, t = readme_str.split("]", 1)
+        variant_name = variant_name.strip("[")
+        # print(f"checking {variant_name}")
+        t, variant_uri = readme_str.split("(", 1)
+        variant_uri = variant_uri.replace("),", "")
+        variant_uri = variant_uri.replace(")", "")
+        variant_uri = variant_uri.strip()
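+        # e.g. (assumed README format) a line "[Вариант 1](1.1.100_Вариант_1.md),"
+        # yields variant_name "Вариант 1" and variant_uri "1.1.100_Вариант_1.md"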
+        
+        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
+        try:
+            with open(variant_path, encoding="utf-8") as f:
+                variant_html = f.read()
+        except UnicodeDecodeError:  # same cp1251 fallback as for the README
+            with open(variant_path, encoding="cp1251") as f:
+                variant_html = f.read()
+
+        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
+        # print(f"number of unique words in this variant: {len(set(variant_tokens))}")
+
+        # intersection of the unique-word sets
+        min_tokens_len = min(len(set(post_tokens)), len(set(variant_tokens)))
+        c = list(set(post_tokens) & set(variant_tokens))
+        # share of the smaller vocabulary that does NOT occur in the other text
+        ratio = (1 - (len(c) / min_tokens_len)) * 100
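+        # worked example: if the smaller text has 200 unique words and 150 of
+        # them also occur in the other text, ratio = (1 - 150/200) * 100 = 25%,
+        # i.e. only a quarter of its vocabulary is unmatched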
+        if ratio < min_ratio:
+            min_ratio = ratio
+            min_ratio_file = variant_path
+
+        # print(f"number of matching words: {len(c)} / {ratio:.2f}%")
+        # print()
+
+
+print()
+print(f"min_ratio: {min_ratio:.2f}%")
+print(f"min_ratio_file: {min_ratio_file}")
+print("success")
+