@@ -0,0 +1,183 @@
+# full-check version, with Russian spell checking
+
+import os
+from tqdm import tqdm
+import datetime
+import requests
+
+# download the stopwords corpus; this only needs to run once
+import nltk
+#nltk.download("stopwords")
+from nltk.corpus import stopwords
+import pymorphy2
+from string import punctuation
+
+
+# ------------------------------- SETTINGS ------------
+# directory of this file (one level up, matching the repository layout for the 2nd semester of 2022-23)
+BASE_DIR = os.path.abspath(os.path.dirname(__file__))
+# directory being checked
+LECTION_DIR = os.path.join(BASE_DIR, "Лекции")
+
+# URL of the lecture page to check
+url = "http://213.155.192.79:3001/u21liseenko/ISRPO/src/2c90e5020faf186b19cff6b0a35397f45f815f59/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/JavaScript.md"
+
+# ------------------------------- / SETTINGS ------------
+
+
+# switch the Gitea link from the rendered page ("src") to the raw file ("raw")
+url = url.replace("src", "raw")
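+# (example: ".../src/<commit>/Лекции/JavaScript.md" becomes ".../raw/<commit>/...",
+# which Gitea serves as raw markdown; note that str.replace swaps every occurrence,
+# so this assumes "src" appears nowhere else in the URL)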
+
+
+# create the lemmatizer and the stopword list
+morph = pymorphy2.MorphAnalyzer()
+russian_stopwords = stopwords.words("russian")
+
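+# (note: pymorphy2 is no longer maintained and can fail to install on newer
+# Python versions; the pymorphy3 fork is a near-drop-in replacement if needed)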
+
+# preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize
+def preprocess_text(text):
+    translator = str.maketrans(punctuation, ' ' * len(punctuation))
+    words = text.translate(translator)
+    words = words.lower().split()
+
+    # strip any punctuation still attached to a word (word, "or like this")
+    clear_words = []
+    for word in words:
+        clear_word = ""
+        for s in word:
+            if s not in punctuation:
+                clear_word = clear_word + s
+        clear_words.append(clear_word)
+    tokens = [morph.parse(token)[0].normal_form for token in clear_words
+              if token not in russian_stopwords
+              and token != " "
+              and token.strip() not in punctuation]
+
+    text = " ".join(tokens)
+    return tokens, text
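+
+# A rough usage sketch (hypothetical input), assuming NLTK's Russian stopword list:
+#   preprocess_text("Привет, мир!") -> (["привет", "мир"], "привет мир")
+# punctuation is stripped, stopwords are dropped, and each word is lemmatized
+# to its normal form.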
+
+
+# spell-check function: count how many tokens LanguageTool flags
+import language_tool_python
+tool = language_tool_python.LanguageTool('ru-RU')
+def orfo_text(tokens):
+    bad_tokens_n = 0
+    for token in tokens:
+        matches = tool.check(token)
+        if len(matches) > 0:
+            bad_tokens_n += 1
+            #print(matches[0].ruleId)
+
+    return bad_tokens_n
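+
+# (notes: LanguageTool('ru-RU') starts a local Java-based server and may download
+# it on first run; tool.check() is called once per token here, which is slow and
+# ignores sentence context, so checking whole lines would likely be faster at the
+# cost of a less direct per-word error count)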
+
+print()
+
+now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
+out_str = f"Время проверки: {now} \n"
+# print(out_str)
+
+response = requests.get(url)
+post_html = response.text
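+# (optional hardening, not in the original: calling response.raise_for_status() here
+# would fail fast on HTTP errors instead of silently analyzing an error page)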
+
+post_list = post_html.split("\n")
+
+# check that the first line (the article header) is formatted correctly
+header_exist = True
+line_1 = post_list[0].strip()
+line_1 = line_1.replace(chr(65279), "")  # strip a UTF-8 BOM (U+FEFF) if present
+if (line_1[0:2]) != "# ":
+    # (note: the ord() calls below assume the first line is non-empty)
+    print(f"Заголовок статьи не найден: '{line_1[0:1]} {line_1[1:2]}' вместо '# '")
+    print(f"{ord(line_1[0:1])} {ord(line_1[1:2])} вместо {ord('#')} {ord(' ')}")
+    header_exist = False
+
+# check that the questions section and the reference list are present
+quest_exist = False
+source_exist = False
+for post_line in post_list:
+    if (post_line[0:2] == "##"):
+        if ("Вопросы" in post_line):
+            quest_exist = True
+        if ("Список" in post_line) and ("литературы" in post_line):
+            source_exist = True
+if not quest_exist:
+    print("Вопросы не найдены")
+if not source_exist:
+    print("Список литературы не найден")
+
+
+header_text = line_1.replace("# ", "")
+header_text = header_text.replace(".", "")
+header_text = header_text.strip()
+print(f"Заголовок: {header_text}")
+
+# look for other lectures on the same topic
+readme_path = os.path.join(LECTION_DIR, "README.md")  # LECTION_DIR is already absolute
+try:
+    with open(readme_path, encoding="utf-8") as f:
+        readme_html = f.read()
+except UnicodeDecodeError:
+    # fall back to Windows-1251 for files saved in the legacy encoding
+    with open(readme_path, encoding="cp1251") as f:
+        readme_html = f.read()
+
+
+post_tokens, post_uniq_text = preprocess_text(post_html)
+print(f"количество уникальных слов: {len(set(post_tokens))}")
+
+bad_tokens_n = orfo_text(post_tokens)
+# share of flagged tokens as a percentage, truncated to two decimal places
+bad_tokens_stat = int(bad_tokens_n / len(post_tokens) * 10000) / 100
+print(f"процент ошибок: {bad_tokens_stat}%")
+print()
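+# (nearly equivalent and a bit clearer: round(bad_tokens_n / len(post_tokens) * 100, 2),
+# which rounds instead of truncating; both variants assume post_tokens is non-empty,
+# otherwise the division raises ZeroDivisionError)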
+
+
+min_ratio = 1000
+min_ratio_name = ""
+
+readme_list = readme_html.split("\n")
+for readme_str in tqdm(readme_list):
+    readme_str = readme_str.strip()
+
+    if len(readme_str) > 0:
+
+        # a line containing a link to a lecture: "[name](uri)"
+        if "[" in readme_str:
+            variant_name, t = readme_str.split("]", 1)
+            variant_name = variant_name.strip("[")
+            t, variant_uri = readme_str.split("(", 1)
+            variant_uri = variant_uri.replace("),", "")
+            variant_uri = variant_uri.replace(")", "")
+            variant_uri = variant_uri.strip()
+
+            if "youtube" not in variant_uri:
+                variant_path = os.path.join(LECTION_DIR, variant_uri)
+                try:
+                    with open(variant_path, encoding="utf-8") as f:
+                        variant_html = f.read()
+                except UnicodeDecodeError:
+                    with open(variant_path, encoding="cp1251") as f:
+                        variant_html = f.read()
+
+                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
+
+                # intersection of the unique-token sets
+                min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
+                c = list(set(post_tokens) & set(variant_tokens))
+                ratio = (1 - (len(c) / min_tokens_len)) * 100
+
+                if min_ratio > ratio:
+                    min_ratio = ratio
+                    min_ratio_name = readme_str
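+
+# (metric: ratio is the percentage of the smaller document's unique tokens that are
+# NOT shared with the checked lecture, so a min_ratio close to 0 means a near-duplicate;
+# 1000 is just a "no comparison yet" sentinel, and a min_tokens_len of 0 would divide by zero)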
+print(f"min_ratio: {min_ratio:.2f}%")
+print(f"{min_ratio_name}")