# Compare one lecture (fetched by URL) against every lecture linked from the
# README index (used for differential credit checks).
import os
import datetime
from difflib import SequenceMatcher  # NOTE(review): unused; kept in case callers rely on it
from string import punctuation

import requests
from tqdm import tqdm

# download stopwords corpus, you need to run it once
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2

# ------------------------------- SETTINGS ------------
# Repository root: one level above this file (matches the repository layout
# of 2nd semester 2022-23).
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# Directory with the lectures to compare against.
# LECTION_DIR = os.path.join("ISRPO", "Лекции")
LECTION_DIR = os.path.join("EASvZI", "Лекции")
# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")

# URL of the lecture being checked.
url = "http://213.155.192.79:3001/u20-24osipenko/EASvZI/raw/7ce0ad4bc82db893c3370aa7c7bb3866ac2b9f84/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/2.2.200_%d0%90%d0%b2%d1%82%d0%be%d0%bc%d0%b0%d1%82%d0%b8%d0%b7%d0%b0%d1%86%d0%b8%d1%8f_%d1%83%d0%bf%d1%80%d0%b0%d0%b2%d0%bb%d0%b5%d0%bd%d0%b8%d1%8f_%d1%81%d0%b5%d1%82%d1%8c%d1%8e/2.2.200_%d0%90%d0%b2%d1%82%d0%be%d0%bc%d0%b0%d1%82%d0%b8%d0%b7%d0%b0%d1%86%d0%b8%d1%8f_%d1%83%d0%bf%d1%80%d0%b0%d0%b2%d0%bb%d0%b5%d0%bd%d0%b8%d1%8f_%d1%81%d0%b5%d1%82%d1%8c%d1%8e.md"
# ------------------------------- / SETTINGS ------------

# Lemmatizer and stopword list shared by preprocess_text().
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Tokenize *text*: strip punctuation, lowercase, drop Russian stopwords,
    lemmatize with pymorphy2.

    Returns a tuple ``(tokens, joined_text)`` where *tokens* is the list of
    normal forms and *joined_text* is the same tokens joined with spaces.
    """
    # Replace every ASCII punctuation character with a space, then split.
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator).lower().split()

    # Strip punctuation characters still glued to a word (e.g. «word, "or so"»).
    # str.join avoids the quadratic char-by-char concatenation of the original.
    clear_words = [
        "".join(ch for ch in word if ch not in punctuation)
        for word in words
    ]

    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    return tokens, " ".join(tokens)


def _read_text(path):
    """Read a file trying UTF-8 first, then CP1251 (legacy Windows files)."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, encoding="cp1251") as f:
            return f.read()


print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Время проверки: {now} \n"
# print(out_str)

response = requests.get(url)
# Fail fast on HTTP errors instead of silently comparing an error page.
response.raise_for_status()
post_html = response.text

post_tokens, post_uniq_text = preprocess_text(post_html)
print(f"количество уникальных слов: {len(set(post_tokens))}")
print()

# Find the other lectures via the README index.
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
readme_html = _read_text(readme_path)

min_ratio = 100
min_ratio_file = ""
# Hoisted out of the loop: the checked document's token set never changes.
post_token_set = set(post_tokens)

for readme_str in tqdm(readme_html.split("\n")):
    if '[' not in readme_str:
        continue
    try:
        # Markdown link line: [name](uri) — extract both parts.
        variant_name, _ = readme_str.split("]")
        variant_name = variant_name.strip("[")
        _, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "").replace(")", "").strip()

        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
        variant_html = _read_text(variant_path)
        variant_tokens, _ = preprocess_text(variant_html)

        # Similarity = share of common unique tokens relative to the
        # smaller document; ratio is the *difference* percentage.
        variant_token_set = set(variant_tokens)
        min_tokens_len = min(len(post_token_set), len(variant_token_set))
        if min_tokens_len == 0:
            # Empty document: nothing to compare; avoids ZeroDivisionError.
            continue
        common = post_token_set & variant_token_set
        ratio = (1 - (len(common) / min_tokens_len)) * 100
        if ratio < min_ratio:
            min_ratio = ratio
            min_ratio_file = variant_path
    except (ValueError, OSError):
        # ValueError: malformed markdown link (split() unpack failed);
        # OSError: the linked file is missing or unreadable.
        print(f"Ошибка распаковки {readme_str}")

print()
print(f"min_ratio: {min_ratio}%")
print(f"min_ratio_file: {min_ratio_file}")
print("success")