123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- # версия полной проверки с проверкой русской орфографии
- import os
- from difflib import SequenceMatcher
- from tqdm import tqdm
- import datetime
- import requests
- # download stopwords corpus, you need to run it once
- import nltk
- #nltk.download("stopwords")
- from nltk.corpus import stopwords
- import pymorphy2
- from string import punctuation
- # ------------------------------- НАСТРОЙКИ ------------
- # директория файла (на уровень выше, для структуры репозиториев 2 сем. 2022-23)
- BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
- # проверяемая директория
- LECTION_DIR = os.path.join("EASvZI", "Лекции")
- #Create lemmatizer and stopwords list
- morph = pymorphy2.MorphAnalyzer()
- russian_stopwords = stopwords.words("russian")
- #Preprocess function
- def preprocess_text(text):
- translator = str.maketrans(punctuation, ' '*len(punctuation))
- words = text.translate(translator)
- words = words.lower().split()
-
- # очистка от прилегающего к слову мусора (слово, "или так")
- clear_words = []
- for word in words:
- clear_word = ""
- for s in word:
- if not s in punctuation:
- clear_word = clear_word + s
- clear_words.append(clear_word)
- tokens = []
- tokens = [morph.parse(token)[0].normal_form for token in clear_words if token not in russian_stopwords\
- and token != " " \
- and token.strip() not in punctuation \
- ]
- text = " ".join(tokens)
- return tokens, text
- #Preprocess function
- import language_tool_python
- tool = language_tool_python.LanguageTool('ru-RU')
- def orfo_text(tokens):
- bad_tokens_n = 0
- for token in tokens:
- matches = tool.check(token)
- if len(matches)>0:
- bad_tokens_n += 1
- #print(matches[0].ruleId)
- return bad_tokens_n
-
- # ищем другие лекции по этой теме
- readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
- try:
- with open(readme_path, encoding="utf-8") as f:
- readme_html = f.read()
- except:
- with open(readme_path, encoding="cp1251") as f:
- readme_html = f.read()
- """
- █ █ █████ ███████
- █ █ ██ ██ ██ ██
- █ █ ███████ ███████
- █ █ ██ ██ ██ ██
- ██ ██ ██ ██ ██
- """
- bad_variants_text = ""
- lection_name_str = ""
- readme_list = readme_html.split("\n")
- for readme_str in readme_list[253:]:
- readme_str = readme_str.strip()
- readme_str_list = readme_str.split(" ")
- if "[" in readme_str:
- variant_name, t = readme_str.split("]")
- variant_name = variant_name.strip("[")
- print(f"проверяю: {variant_name} / {lection_name_str}")
- t, variant_uri = readme_str.split("(")
- variant_uri = variant_uri.replace("),", "")
- variant_uri = variant_uri.replace(")", "")
- variant_uri = variant_uri.strip()
-
- if "youtube" in variant_uri:
- print("youtube - не проверяем")
- print()
- else:
- variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
- if os.path.isfile(variant_path):
- try:
- with open(variant_path, encoding="utf-8") as f:
- variant_html = f.read()
- except:
- with open(variant_path, encoding="cp1251") as f:
- variant_html = f.read()
- variant_tokens, variant_uniq_text = preprocess_text(variant_html)
- print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")
- bad_tokens_n = orfo_text(variant_tokens)
- bad_tokens_stat = int(bad_tokens_n / len(variant_tokens) * 10000) / 100
- print(f"процент ошибок: {bad_tokens_stat}%")
- bad_variants_text += f"{lection_name_str}\n{variant_name}: {bad_tokens_stat}\n\n"
- else:
- bad_variants_text += f"!!! {lection_name_str}\n{variant_name}: Файла нет"
- with open("bad_variants_1.txt", "w", encoding="utf-8") as f:
- f.write(bad_variants_text)
-
- else:
- lection_name_str = readme_str
|