plagiat_all.v3.py

# Full-check version, with Russian spell checking.
import os
from difflib import SequenceMatcher  # unused in this spelling-only version
from tqdm import tqdm                # unused here
import datetime                      # unused here
import requests                      # unused here
# Download the stopwords corpus; this needs to run only once.
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
import language_tool_python
from string import punctuation
# ------------------------------- SETTINGS -------------------------------
# Base directory: one level above this file (matches the repository layout
# of semester 2, 2022-23).
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# Directory being checked.
LECTION_DIR = os.path.join("Лекции", "ПМ3.2")
# Output report file.
final_filename = "bad_07Dec23_ПМ3.2_2.txt"
# Line of README.md where the lecture list starts; header lines above it are skipped.
start_line = 63
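# Illustrative example (hypothetical paths): if this script lives at
# /repo/tools/plagiat_all.v3.py, then BASE_DIR resolves to /repo and the
# README parsed below is /repo/Лекции/ПМ3.2/README.md.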
# Create the lemmatizer and the stopword list.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize.
def preprocess_text(text):
    # Replace every punctuation character with a space, then split into words.
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # Strip punctuation still attached to a word (e.g.: word, "or like this").
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word += s
        clear_words.append(clear_word)
    # Lemmatize and drop stopwords, spaces and leftover punctuation.
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
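# Expected behaviour on a small input (assuming the NLTK stopwords corpus is
# downloaded; exact lemmas depend on the pymorphy2 dictionaries):
# preprocess_text("Это тестовый текст!")
#   -> (['тестовый', 'текст'], 'тестовый текст')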
# Spell checking: count tokens that LanguageTool flags with at least one match.
tool = language_tool_python.LanguageTool('ru-RU')

def orfo_text(tokens):
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            #print(matches[0].ruleId)
    return bad_tokens_n
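# Note: one tool.check() call per token means one round-trip to the local
# LanguageTool server per word, which is slow on long lectures. A faster
# variant (a sketch; it counts rule matches rather than distinct bad tokens)
# would check the joined text in a single call:
# def orfo_text_fast(tokens):
#     return len(tool.check(" ".join(tokens)))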
# Find the other lectures on this topic.
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
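# The same utf-8/cp1251 fallback is repeated below for every variant file;
# a small helper (hypothetical, not in the original) would remove the
# duplication:
# def read_text(path):
#     try:
#         with open(path, encoding="utf-8") as f:
#             return f.read()
#     except UnicodeDecodeError:
#         with open(path, encoding="cp1251") as f:
#             return f.read()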
  62. """
  63. █ █ █████ ███████
  64. █ █ ██ ██ ██ ██
  65. █ █ ███████ ███████
  66. █ █ ██ ██ ██ ██
  67. ██ ██ ██ ██ ██
  68. """
bad_variants_text = ""
lection_name_str = ""
readme_list = readme_html.split("\n")
for readme_str in readme_list[start_line:]:
    readme_str = readme_str.strip()
    if "[" in readme_str:
        # Markdown link line: "[variant name](uri)," -> name and uri.
        variant_name, t = readme_str.split("]")
        variant_name = variant_name.strip("[")
        print(f"checking: {variant_name} / {lection_name_str}")
        t, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()
        if "youtube" in variant_uri:
            print("youtube - not checked")
            print()
        else:
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            if os.path.isfile(variant_path):
                try:
                    with open(variant_path, encoding="utf-8") as f:
                        variant_html = f.read()
                except UnicodeDecodeError:
                    with open(variant_path, encoding="cp1251") as f:
                        variant_html = f.read()
                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
                print(f"unique words in this variant: {len(set(variant_tokens))}")
                bad_tokens_n = orfo_text(variant_tokens)
                # Share of flagged tokens as a percentage, truncated to two
                # decimal places (guarding against empty files).
                if variant_tokens:
                    bad_tokens_stat = int(bad_tokens_n / len(variant_tokens) * 10000) / 100
                else:
                    bad_tokens_stat = 0.0
                print(f"error rate: {bad_tokens_stat}%")
                bad_variants_text += f"{lection_name_str}\n{variant_name}: {bad_tokens_stat}\n\n"
            else:
                bad_variants_text += f"!!! {lection_name_str}\n{variant_name}: file missing\n"
            # Rewrite the report after each variant so partial results survive a crash.
            with open(final_filename, "w", encoding="utf-8") as f:
                f.write(bad_variants_text)
    else:
        # Any non-link line is remembered as the current lecture heading.
        lection_name_str = readme_str
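# SequenceMatcher is imported at the top but unused in this spelling-only
# version; the plagiarism half of the full check presumably compares the
# preprocessed variant texts pairwise. A minimal sketch of such a comparison
# (an assumption, not taken from this file):
# def similarity(text_a, text_b):
#     return SequenceMatcher(None, text_a, text_b).ratio()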