plagiat_1_full.v2

ypv committed 2 years ago
parent commit b0dd340bb0
2 changed files with 143 additions and 3 deletions
  1. plagiat_1.v2.py (+3 -3)
  2. plagiat_1_full.v2.py (+140 -0)

+ 3 - 3
plagiat_1.v2.py

@@ -18,11 +18,11 @@ from thefuzz import fuzz
 BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
 # directory to be checked
 # LECTION_DIR = os.path.join("ISRPO", "Лекции")
-LECTION_DIR = os.path.join("EASvZI", "Лекции")
-# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
+# LECTION_DIR = os.path.join("EASvZI", "Лекции")
+LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
 
 # URL to check
-url = "http://213.155.192.79:3001/u20-24osipenko/EASvZI/raw/14b6dd3163277c86ece71dd677d8c571b0623e9e/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.7.100_%d0%9e%d0%b1%d1%89%d0%b8%d0%b5_%d1%82%d1%80%d0%b5%d0%b1%d0%be%d0%b2%d0%b0%d0%bd%d0%b8%d1%8f_%d0%bf%d0%be_%d0%b7%d0%b0%d1%89%d0%b8%d1%82%d0%b5_%d0%bf%d0%b5%d1%80%d1%81%d0%be%d0%bd%d0%b0%d0%bb%d1%8c%d0%bd%d1%8b%d1%85_%d0%b4%d0%b0%d0%bd%d0%bd%d1%8b%d1%85/1.7.100_%d0%9e%d0%b1%d1%89%d0%b8%d0%b5_%d1%82%d1%80%d0%b5%d0%b1%d0%be%d0%b2%d0%b0%d0%bd%d0%b8%d1%8f_%d0%bf%d0%be_%d0%b7%d0%b0%d1%89%d0%b8%d1%82%d0%b5_%d0%bf%d0%b5%d1%80%d1%81%d0%be%d0%bd%d0%b0%d0%bb%d1%8c%d0%bd%d1%8b%d1%85_%d0%b4%d0%b0%d0%bd%d0%bd%d1%8b%d1%85..md"
+url = "http://213.155.192.79:3001/u20-24goncharov/EASvZI/raw/9bfc4f68f89672173113ce27c707265bac180874/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.2.300_%d0%a1%d1%82%d0%b0%d0%b4%d0%b8%d0%b8_%d0%b6%d0%b8%d0%b7%d0%bd%d0%b5%d0%bd%d0%bd%d0%be%d0%b3%d0%be_%d1%86%d0%b8%d0%ba%d0%bb%d0%b0_%d0%90%d0%98%d0%a1/Goncharov.md"
 
 # ------------------------------- / SETTINGS ------------
 

+ 140 - 0
plagiat_1_full.v2.py

@@ -0,0 +1,140 @@
+# Check one submission against all those listed in the readme (for differentiated credit tests)
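+#
+# Summary of the flow below: download the markdown file at `url`, lemmatize it,
+# then compare its set of unique words against every lecture linked from
+# README.md in LECTION_DIR, reporting the closest match.
+# Assumed invocation (not stated in the commit): python plagiat_1_full.v2.py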
+
+import os
+import datetime
+import requests
+from tqdm import tqdm
+
+# download the stopwords corpus; this only needs to run once
+import nltk
+# nltk.download("stopwords")
+from nltk.corpus import stopwords
+import pymorphy2
+from string import punctuation
+
+# ------------------------------- SETTINGS ------------
+# base directory (one level up, for the repository layout of sem. 2, 2022-23)
+BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+# directory to be checked
+# LECTION_DIR = os.path.join("ISRPO", "Лекции")
+# LECTION_DIR = os.path.join("EASvZI", "Лекции")
+LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
+
+# URL to check
+url = "http://213.155.192.79:3001/Chubarov/TZI/raw/299653167a99931f26f9db2256ec10cf37973dc9/2022-23/%d0%94%d0%b8%d1%84.%d0%b7%d0%b0%d1%87%d0%b5%d1%82_2%d1%81%d0%b5%d0%bc/%d0%92%d0%be%d0%bf%d1%80%d0%be%d1%81_3_%d1%80%d0%b0%d0%b7%d0%b4%d0%b5%d0%bb_2.md"
+
+# ------------------------------- / SETTINGS ------------
+
+
+
+# Create the lemmatizer and the stopword list
+morph = pymorphy2.MorphAnalyzer()
+russian_stopwords = stopwords.words("russian")
+
+# Preprocess function: lowercase, strip punctuation, drop stopwords, lemmatize
+def preprocess_text(text):
+    # replace every punctuation character with a space, then split into words
+    translator = str.maketrans(punctuation, ' ' * len(punctuation))
+    words = text.translate(translator)
+    words = words.lower().split()
+
+    # strip punctuation still clinging to a word (word, "or like this")
+    clear_words = []
+    for word in words:
+        clear_word = ""
+        for s in word:
+            if s not in punctuation:
+                clear_word = clear_word + s
+        clear_words.append(clear_word)
+
+    # lemmatize with pymorphy2, dropping stopwords and leftover punctuation
+    tokens = [morph.parse(token)[0].normal_form for token in clear_words
+              if token not in russian_stopwords
+              and token != " "
+              and token.strip() not in punctuation]
+
+    text = " ".join(tokens)
+    return tokens, text
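+
+# Illustration (assumed output; exact lemmas depend on the pymorphy2
+# dictionaries): preprocess_text("Общие требования по защите!") returns
+# roughly (['общий', 'требование', 'защита'], 'общий требование защита');
+# "по" is dropped as an NLTK Russian stopword.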
+
+
+print()
+
+now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
+out_str = f"Check time: {now}\n"
+# print(out_str)
+
+# download the checked document (raw markdown) and preprocess it
+response = requests.get(url)
+post_html = response.text
+
+post_tokens, post_uniq_text = preprocess_text(post_html)
+print(f"number of unique words: {len(set(post_tokens))}")
+print()
+
+
+# look for the other lectures listed in the course README
+readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
+try:
+    with open(readme_path, encoding="utf-8") as f:
+        readme_html = f.read()
+except UnicodeDecodeError:  # fall back to the legacy cp1251 encoding
+    with open(readme_path, encoding="cp1251") as f:
+        readme_html = f.read()
+
+# track the closest match (lower ratio = more shared vocabulary)
+min_ratio = 100
+min_ratio_file = ""
+
+readme_list = readme_html.split("\n")
+for readme_str in tqdm(readme_list):
+    if '[' in readme_str:
+        # parse a markdown link of the form [name](uri)
+        variant_name, t = readme_str.split("]", 1)
+        variant_name = variant_name.strip("[")
+        # print(f"checking {variant_name}")
+        t, variant_uri = readme_str.split("(", 1)
+        variant_uri = variant_uri.replace("),", "")
+        variant_uri = variant_uri.replace(")", "")
+        variant_uri = variant_uri.strip()
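+        # e.g. (assumed README format) a line "[Вариант 1](1.1.100_Вариант_1.md),"
+        # yields variant_name "Вариант 1" and variant_uri "1.1.100_Вариант_1.md"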
+        
+        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
+        try:
+            with open(variant_path, encoding="utf-8") as f:
+                variant_html = f.read()
+        except UnicodeDecodeError:  # same cp1251 fallback as for the README
+            with open(variant_path, encoding="cp1251") as f:
+                variant_html = f.read()
+
+        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
+        # print(f"number of unique words in this variant: {len(set(variant_tokens))}")
+
+        # intersection of the unique-word sets
+        min_tokens_len = min(len(set(post_tokens)), len(set(variant_tokens)))
+        c = list(set(post_tokens) & set(variant_tokens))
+        # share of the smaller vocabulary that does NOT occur in the other text
+        ratio = (1 - (len(c) / min_tokens_len)) * 100
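+        # worked example: if the smaller text has 200 unique words and 150 of
+        # them also occur in the other text, ratio = (1 - 150/200) * 100 = 25%,
+        # i.e. only a quarter of its vocabulary is unmatched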
+        if ratio < min_ratio:
+            min_ratio = ratio
+            min_ratio_file = variant_path
+
+        # print(f"number of matching words: {len(c)} / {ratio:.2f}%")
+        # print()
+
+
+print()
+print(f"min_ratio: {min_ratio:.2f}%")
+print(f"min_ratio_file: {min_ratio_file}")
+print("success")
+