|
@@ -0,0 +1,140 @@
|
|
|
+# Проверка одной со всеми из readme (для диф.зачетов)
|
|
|
+
|
|
|
+import os
|
|
|
+from difflib import SequenceMatcher
|
|
|
+from tqdm import tqdm
|
|
|
+import datetime
|
|
|
+import requests
|
|
|
+
|
|
|
+# download stopwords corpus, you need to run it once
|
|
|
+import nltk
|
|
|
+#nltk.download("stopwords")
|
|
|
+from nltk.corpus import stopwords
|
|
|
+import pymorphy2
|
|
|
+from string import punctuation
|
|
|
+
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
# ------------------------------- SETTINGS ------------
# Base directory of this file (one level up — matches the repository
# structure used in semester 2, 2022-23).
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# Directory with the local lectures to compare against
# (alternative course directories kept for quick switching):
# LECTION_DIR = os.path.join("ISRPO", "Лекции")
# LECTION_DIR = os.path.join("EASvZI", "Лекции")
LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")

# URL of the submitted document that is checked against every local lecture
url = "http://213.155.192.79:3001/Chubarov/TZI/raw/299653167a99931f26f9db2256ec10cf37973dc9/2022-23/%d0%94%d0%b8%d1%84.%d0%b7%d0%b0%d1%87%d0%b5%d1%82_2%d1%81%d0%b5%d0%bc/%d0%92%d0%be%d0%bf%d1%80%d0%be%d1%81_3_%d1%80%d0%b0%d0%b7%d0%b4%d0%b5%d0%bb_2.md"

# ------------------------------- / SETTINGS ------------
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
# Create the lemmatizer and the Russian stopwords list once; they are
# shared by every preprocess_text() call below.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
|
|
|
+
|
|
|
# Preprocess function
def preprocess_text(text):
    """Normalize *text* for set-based similarity comparison.

    Replaces every punctuation character with a space, lower-cases and
    splits the text, drops stopwords and stray punctuation, and
    lemmatizes each remaining word with pymorphy2.

    Parameters
    ----------
    text : str
        Raw document text (markdown/HTML markup characters are treated
        as punctuation and removed).

    Returns
    -------
    tuple[list[str], str]
        The normalized tokens and the same tokens joined into one
        space-separated string.
    """
    # Map every punctuation character to a space in a single C-level pass.
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator).lower().split()

    # NOTE: after translate() no punctuation character can remain inside
    # a word, so the original per-character cleanup loop was a no-op and
    # has been removed; the defensive filters below are kept unchanged.
    tokens = [
        morph.parse(word)[0].normal_form
        for word in words
        if word not in russian_stopwords
        and word != " "
        and word.strip() not in punctuation
    ]

    return tokens, " ".join(tokens)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
print()

# Timestamp of this check run; kept for the (currently disabled) report line.
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Время проверки: {now} \n"
# print(out_str)

# Download the submitted document and normalize it once up front;
# `post_tokens` is reused against every local lecture below.
response = requests.get(url)
post_html = response.text

post_tokens, post_uniq_text = preprocess_text(post_html)
print(f"количество уникальных слов: {len(set(post_tokens))}")
print()
|
|
|
+
|
|
|
+
|
|
|
post_list = post_html.split("\n")
# Locate the other lectures via the section's README index file.
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    # The repository mixes UTF-8 and Windows-1251 files; retry with
    # cp1251.  (The original bare `except:` also swallowed
    # FileNotFoundError and then crashed confusingly on the retry.)
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
|
|
|
+
|
|
|
# Lowest uniqueness percentage seen so far and the file that produced it.
min_ratio = 100
min_ratio_file = ""

readme_list = readme_html.split("\n")
for readme_str in tqdm(readme_list):
    # Only lines containing a markdown link "[title](uri)" reference a lecture.
    if '[' not in readme_str:
        continue

    # maxsplit=1 keeps lines with more than one ']' or '(' (e.g. two
    # links on one line) from raising ValueError on unpacking, which
    # crashed the original script.
    variant_name, _ = readme_str.split("]", 1)
    variant_name = variant_name.strip("[")
    # print(f"проверяю {variant_name}")
    _, variant_uri = readme_str.split("(", 1)
    variant_uri = variant_uri.replace("),", "").replace(")", "").strip()

    variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
    try:
        with open(variant_path, encoding="utf-8") as f:
            variant_html = f.read()
    except UnicodeDecodeError:
        # Repository mixes UTF-8 and Windows-1251 files; retry with cp1251.
        # A genuinely missing file still raises FileNotFoundError, as before.
        with open(variant_path, encoding="cp1251") as f:
            variant_html = f.read()

    variant_tokens, variant_uniq_text = preprocess_text(variant_html)
    # print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")

    # Uniqueness metric: the share of the smaller document's unique words
    # that do NOT occur in the other document, as a percentage.
    min_tokens_len = min(len(set(post_tokens)), len(set(variant_tokens)))
    if min_tokens_len == 0:
        # An empty document gives nothing to compare — skip it instead of
        # dividing by zero (the original crashed here).
        continue
    c = list(set(post_tokens) & set(variant_tokens))
    ratio = (1 - (len(c) / min_tokens_len)) * 100
    if ratio < min_ratio:
        min_ratio = ratio
        min_ratio_file = variant_path

    # print(f"количество совпадающих слов: {len(c)} / {ratio:.2f}%")
    # print()
|
|
|
+
|
|
|
+
|
|
|
# Report the closest match found across all lectures.
print()
print(
    f"min_ratio: {min_ratio}%",
    f"min_ratio_file: {min_ratio_file}",
    "success",
    sep="\n",
)
|
|
|
+
|