# Full-check version: checks Russian spelling across lecture variants

import os
from string import punctuation

# download the stopwords corpus; this only needs to run once
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords

import pymorphy2
import language_tool_python
 
# ------------------------------- SETTINGS -------------------------------

# base directory (one level up from this file, matching the repository
# layout for term 2 of 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# directory to check
LECTION_DIR = os.path.join("Лекции", "ПМ3.2")

# report file name and the README.md line the variant list starts at
final_filename = "bad_07Dec23_ПМ3.2_2.txt"
start_line = 63

# create the lemmatizer and the stopwords list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
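# Quick lemmatizer sanity check (uncomment to try; output is pymorphy2's
# most likely parse, e.g. "стали" -> "стать"):
# print(morph.parse("стали")[0].normal_form)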
 
# preprocessing: lowercase, strip punctuation, drop stopwords, lemmatize
def preprocess_text(text):
    # replace every punctuation character with a space
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # strip punctuation still glued to a word (e.g. 'word,' or '"like so"')
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)

    # lemmatize, dropping stopwords and leftover punctuation-only tokens
    tokens = [morph.parse(token)[0].normal_form
              for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
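# Quick sanity check (uncomment to try; exact lemmas depend on the pymorphy2
# dictionaries, but something like ['привет', 'мир', 'тест'] is expected,
# with the stopword "это" dropped):
# tokens, clean_text = preprocess_text("Привет, мир! Это тест.")
# print(tokens)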
 
# spell checker: LanguageTool with the Russian rule set
tool = language_tool_python.LanguageTool('ru-RU')
 
# count how many tokens LanguageTool flags with at least one issue
def orfo_text(tokens):
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            # print(matches[0].ruleId)
    return bad_tokens_n
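# Quick sanity check (uncomment to try; assumes the LanguageTool instance
# above started successfully):
# print(orfo_text(["превет"]))  # misspelled, so likely 1
# print(orfo_text(["привет"]))  # correct, so likely 0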
 
# find the other lectures on this topic via the section README
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
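# The parser below assumes each variant appears in README.md as a single
# markdown link per line, e.g. (hypothetical):
#   [Вариант 1](Вариант_1/README.md)
# Any line without "[" is treated as the current lecture title.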
 
- """
 
- █    █    █████    ███████
 
- █    █   ██   ██   ██    ██
 
-  █  █    ███████   ███████
 
-  █  █    ██   ██   ██  ██
 
-   ██     ██   ██   ██    ██
 
- """
 
bad_variants_text = ""
lection_name_str = ""
readme_list = readme_html.split("\n")
 
for readme_str in readme_list[start_line:]:
    readme_str = readme_str.strip()
    if "[" in readme_str:
        # markdown link: "[name](uri)"
        variant_name, _ = readme_str.split("]")
        variant_name = variant_name.strip("[")
        print(f"checking: {variant_name} / {lection_name_str}")
        _, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()

        if "youtube" in variant_uri:
            print("youtube - not checked")
            print()
        else:
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            if os.path.isfile(variant_path):
                try:
                    with open(variant_path, encoding="utf-8") as f:
                        variant_html = f.read()
                except UnicodeDecodeError:
                    with open(variant_path, encoding="cp1251") as f:
                        variant_html = f.read()
                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
                print(f"unique word count for this variant: {len(set(variant_tokens))}")
                bad_tokens_n = orfo_text(variant_tokens)
                # share of flagged tokens, truncated to two decimal places
                # (guarded so an empty file cannot raise ZeroDivisionError)
                if variant_tokens:
                    bad_tokens_stat = int(bad_tokens_n / len(variant_tokens) * 10000) / 100
                else:
                    bad_tokens_stat = 0.0
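                # worked example: 7 flagged tokens out of 250 ->
                # int(7 / 250 * 10000) / 100 = int(280.0) / 100 = 2.8 (%)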
 
                print(f"error rate: {bad_tokens_stat}%")
                bad_variants_text += f"{lection_name_str}\n{variant_name}: {bad_tokens_stat}\n\n"
            else:
                bad_variants_text += f"!!! {lection_name_str}\n{variant_name}: file missing\n"
            # rewrite the report after every variant so partial results survive interruption
            with open(final_filename, "w", encoding="utf-8") as f:
                f.write(bad_variants_text)
    else:
        lection_name_str = readme_str
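# Running the script (a sketch; "check_orfo.py" is an assumed file name):
#   python check_orfo.py
# On first use language_tool_python downloads a LanguageTool server (Java is
# required); the report is written to the file named in final_filename.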
 
 