# full-check version, with Russian spell checking
import os
from string import punctuation

# download the stopwords corpus; you need to run it once
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
import language_tool_python

# ------------------------------- SETTINGS ------------
# script directory, one level up (matches the repository layout of semester 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory being checked
LECTION_DIR = os.path.join("EASvZI", "Лекции")

# create the lemmatizer and the stopwords list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# preprocess function: returns the lemmatized tokens and the joined text
def preprocess_text(text):
    # replace punctuation with spaces so glued constructs split cleanly
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator).lower().split()

    # strip punctuation still attached to a word (a word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        if clear_word:
            clear_words.append(clear_word)

    # lemmatize and drop stopwords
    tokens = [morph.parse(token)[0].normal_form
              for token in clear_words
              if token not in russian_stopwords]
    text = " ".join(tokens)
    return tokens, text

# spell-check function: counts the tokens LanguageTool flags
tool = language_tool_python.LanguageTool('ru-RU')

def orfo_text(tokens):
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            #print(matches[0].ruleId)
    return bad_tokens_n

# read a repository file; most are UTF-8, some older ones are cp1251
def read_text(path):
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, encoding="cp1251") as f:
            return f.read()

# look up the other lectures on this topic in the README of the checked directory
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
readme_html = read_text(readme_path)

# ------------------------------- MAIN LOOP ------------
bad_variants_text = ""
lection_name_str = ""
readme_list = readme_html.split("\n")
for readme_str in readme_list[253:]:  # skip the head of the README
    readme_str = readme_str.strip()
    if "[" in readme_str:
        # a line like "[variant name](path/to/file.md)," -> name and URI
        variant_name, _ = readme_str.split("]", 1)
        variant_name = variant_name.strip("[")
        print(f"checking: {variant_name} / {lection_name_str}")
        _, variant_uri = readme_str.split("(", 1)
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()
        if "youtube" in variant_uri:
            print("youtube - not checked")
            print()
        else:
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            if os.path.isfile(variant_path):
                variant_html = read_text(variant_path)
                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
                print(f"unique words in the variant: {len(set(variant_tokens))}")
                bad_tokens_n = orfo_text(variant_tokens)
                bad_tokens_stat = round(bad_tokens_n / len(variant_tokens) * 100, 2) if variant_tokens else 0.0
                print(f"error rate: {bad_tokens_stat}%")
                bad_variants_text += f"{lection_name_str}\n{variant_name}: {bad_tokens_stat}\n\n"
            else:
                bad_variants_text += f"!!! {lection_name_str}\n{variant_name}: file missing\n\n"
            # save progress after every checked variant
            with open("bad_variants_1.txt", "w", encoding="utf-8") as f:
                f.write(bad_variants_text)
    else:
        # a non-link line is the name of the current lecture
        lection_name_str = readme_str
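
# Note: orfo_text() calls tool.check() once per token, i.e. one round-trip to
# the local LanguageTool server per word, which gets slow on long lectures.
# A minimal sketch of a batched alternative, assuming the ru-RU spelling rules
# carry "MORFOLOGIK" in their ruleId (the stock LanguageTool convention, but
# worth verifying against your version); an option, not the original logic.
def orfo_text_batch(tokens):
    # one request for the whole token stream instead of len(tokens) requests
    matches = tool.check(" ".join(tokens))
    # count only spelling matches, ignoring grammar/style rules
    return sum(1 for m in matches if "MORFOLOGIK" in m.ruleId)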
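
# The split-based parsing in the main loop breaks on README lines that contain
# more than one "[name](uri)" link or a stray "]" / "(". A hedged sketch of a
# regex alternative (an assumption about the README format, not the original
# parsing logic):
import re

LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")

def iter_links(line):
    # yield a (name, uri) pair for every markdown link on the line
    for m in LINK_RE.finditer(line):
        yield m.group(1), m.group(2).strip()

# usage: for variant_name, variant_uri in iter_links(readme_str): ...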