# Full-check version with Russian spell checking
import os
import datetime
import requests
from tqdm import tqdm

# download the stopwords corpus once, then the line can stay commented out
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation

# ------------------------------- SETTINGS -------------------------------
# script directory (one level up, for the 2022-23 2nd-semester repository layout)
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
# directory being checked
LECTION_DIR = os.path.join(BASE_DIR, "Лекции")
# link to the lecture under check
url = "http://213.155.192.79:3001/u21deev/ISRPO_Deev/src/3cf20668f4a06cec9cb60377eca8fd575ebb67a6/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/DeevStartUP.md"
# ------------------------------- / SETTINGS -----------------------------

# switch the Gitea "src" page URL to the raw-content URL
url = url.replace("src", "raw")

# Create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocess function: returns (lemmatized tokens, joined text)
def preprocess_text(text):
    # replace every punctuation character with a space
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # strip residual junk attached to a word (e.g.: word, "or like this")
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word = clear_word + s
        clear_words.append(clear_word)
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text

# Spell checker (LanguageTool with Russian rules)
import language_tool_python
tool = language_tool_python.LanguageTool('ru-RU')

def orfo_text(tokens):
    # count tokens that trigger at least one LanguageTool match
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            #print(matches[0].ruleId)
    return bad_tokens_n

print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Check time: {now} \n"
# print(out_str)

response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")

# check that the first line is formatted correctly
header_exist = True
line_1 = post_list[0].strip()
line_1 = line_1.replace(chr(65279), "")  # strip the BOM (U+FEFF) if present
if line_1[0:2] != "# ":
    print(f"Article header not found: '{line_1[0:1]} {line_1[1:2]}' instead of '# '")
    print(f"{ord(line_1[0:1])} {ord(line_1[1:2])} instead of {ord('#')} {ord(' ')}")
    header_exist = False

# check that the questions and bibliography sections exist
# (the section names are matched in Russian, as they appear in the lectures)
quest_exist = False
source_exist = False
for post_line in post_list:
    if post_line[0:2] == "##":
        if "Вопросы" in post_line:
            quest_exist = True
        if ("Список" in post_line) and ("литературы" in post_line):
            source_exist = True
if not quest_exist:
    print("Questions section not found")
if not source_exist:
    print("Bibliography section not found")

header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
print(f"Title: {header_text}")

# look for other lectures on the same topic
readme_path = os.path.join(LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()

post_tokens, post_uniq_text = preprocess_text(post_html)
print(f"unique word count: {len(set(post_tokens))}")
bad_tokens_n = orfo_text(post_tokens)
# percentage truncated to two decimal places
bad_tokens_stat = int(bad_tokens_n / len(post_tokens) * 10000) / 100
print(f"spelling error rate: {bad_tokens_stat}%")
print()

min_ratio = 1000
min_ratio_name = ""
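# NOTE (editor's assumption, not stated in the original script): the loop
# below expects README.md lines shaped roughly like
#   [Lecture title](subdir/lecture.md),
# with one link per line; entries pointing at youtube/habr are treated as
# external references and skipped.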
readme_list = readme_html.split("\n")
for readme_str in tqdm(readme_list):
    readme_str = readme_str.strip()
    if len(readme_str) > 0:
        # a line containing a lecture link
        if "[" in readme_str:
            # maxsplit=1 guards against lines with more than one "]" or "("
            variant_name, _ = readme_str.split("]", 1)
            variant_name = variant_name.strip("[")
            _, variant_uri = readme_str.split("(", 1)
            variant_uri = variant_uri.replace("),", "")
            variant_uri = variant_uri.replace(")", "")
            variant_uri = variant_uri.strip()
            # compare only local lecture files, skip external links
            if ("youtube" not in variant_uri) and ("habr" not in variant_uri):
                variant_path = os.path.join(LECTION_DIR, variant_uri)
                try:
                    with open(variant_path, encoding="utf-8") as f:
                        variant_html = f.read()
                except UnicodeDecodeError:
                    with open(variant_path, encoding="cp1251") as f:
                        variant_html = f.read()
                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
                # set intersection: share of common unique words,
                # normalized by the smaller vocabulary
                min_tokens_len = min(len(set(post_tokens)), len(set(variant_tokens)))
                c = list(set(post_tokens) & set(variant_tokens))
                ratio = (1 - (len(c) / min_tokens_len)) * 100
                if min_ratio > ratio:
                    min_ratio = ratio
                    min_ratio_name = readme_str

print(f"min_ratio: {min_ratio:.2f}%")
print(min_ratio_name)
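# A worked example of the overlap metric above (illustrative values only):
#   post_tokens    -> {"git", "commit", "branch"}
#   variant_tokens -> {"git", "commit", "merge", "push"}
#   c = {"git", "commit"}, min_tokens_len = 3
#   ratio = (1 - 2 / 3) * 100 ≈ 33.33%
# A ratio of 0% means the smaller vocabulary is fully contained in the other
# lecture; the entry with the lowest ratio is reported as the closest match.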