# Check one submission against every lecture listed in the README
# (used for graded credit tests, "диф. зачёт")
import os
import datetime
from difflib import SequenceMatcher  # unused by the token check; see the optional sketch at the end
from string import punctuation

import requests
from tqdm import tqdm

# Download the stopwords corpus once before the first run:
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2

# ------------------------------- SETTINGS -------------------------------
# Repository root: one level above this file (matches the repository
# layout of semester 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# Directory with the lectures to check against
# LECTION_DIR = os.path.join("ISRPO", "Лекции")
LECTION_DIR = os.path.join("EASvZI", "Лекции")
# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")

# URL of the submission being checked
url = "http://213.155.192.79:3001/u20-24osipenko/EASvZI/raw/7ce0ad4bc82db893c3370aa7c7bb3866ac2b9f84/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/2.2.200_%d0%90%d0%b2%d1%82%d0%be%d0%bc%d0%b0%d1%82%d0%b8%d0%b7%d0%b0%d1%86%d0%b8%d1%8f_%d1%83%d0%bf%d1%80%d0%b0%d0%b2%d0%bb%d0%b5%d0%bd%d0%b8%d1%8f_%d1%81%d0%b5%d1%82%d1%8c%d1%8e/2.2.200_%d0%90%d0%b2%d1%82%d0%be%d0%bc%d0%b0%d1%82%d0%b8%d0%b7%d0%b0%d1%86%d0%b8%d1%8f_%d1%83%d0%bf%d1%80%d0%b0%d0%b2%d0%bb%d0%b5%d0%bd%d0%b8%d1%8f_%d1%81%d0%b5%d1%82%d1%8c%d1%8e.md"
# ------------------------------- / SETTINGS -----------------------------

# Create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Lowercase, strip punctuation, drop stopwords, lemmatize.

    Returns (tokens, text): the list of normal forms and the same
    tokens joined with spaces.
    """
    # Replace every punctuation character with a space
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # Strip punctuation still glued to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)

    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text


print()
now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M")
out_str = f"Check time: {now} \n"
# print(out_str)

response = requests.get(url)
post_html = response.text

post_tokens, post_uniq_text = preprocess_text(post_html)
print(f"Unique words in the checked submission: {len(set(post_tokens))}")
print()

post_list = post_html.split("\n")

# Collect the other lectures from the README
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()

min_ratio = 100
min_ratio_file = ""
readme_list = readme_html.split("\n")
for readme_str in tqdm(readme_list):
    if "[" in readme_str:  # markdown link: [name](uri)
        # print(f"checking {readme_str}")
        try:
            variant_name, t = readme_str.split("]")
            variant_name = variant_name.strip("[")
            t, variant_uri = readme_str.split("(")
            variant_uri = variant_uri.replace("),", "")
            variant_uri = variant_uri.replace(")", "")
            variant_uri = variant_uri.strip()
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            try:
                with open(variant_path, encoding="utf-8") as f:
                    variant_html = f.read()
            except UnicodeDecodeError:
                with open(variant_path, encoding="cp1251") as f:
                    variant_html = f.read()

            variant_tokens, variant_uniq_text = preprocess_text(variant_html)
            # print(f"unique words in the variant: {len(set(variant_tokens))}")

            # Set intersection: ratio is the share of NON-matching words,
            # relative to the smaller of the two vocabularies, in percent
            min_tokens_len = min(len(set(post_tokens)), len(set(variant_tokens)))
            c = list(set(post_tokens) & set(variant_tokens))
            ratio = (1 - (len(c) / min_tokens_len)) * 100
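            # Worked example (hypothetical numbers): if the submission has
            # 500 unique tokens, the variant has 400, and 120 tokens are
            # shared, then min_tokens_len = 400 and
            # ratio = (1 - 120 / 400) * 100 = 70.0,
            # i.e. 70% of the smaller vocabulary does not overlap.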
            if ratio < min_ratio:
                min_ratio = ratio
                min_ratio_file = variant_path
            # print(f"matching words: {len(c)} / {ratio:.2f}%")
            # print()
        except Exception:
            print(f"Failed to parse {readme_str}")

print()
print(f"min_ratio: {min_ratio}%")
print(f"min_ratio_file: {min_ratio_file}")

print("success")
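
# --- Optional line-level check (a minimal sketch, not used above) ---
# SequenceMatcher is imported at the top but unused by the token-based
# comparison. This hypothetical helper shows how it could score two
# individual lines, should a finer-grained, order-sensitive check ever
# be wanted; the function name and the sample strings are illustrative.
def line_similarity(a: str, b: str) -> float:
    """Return the difflib similarity ratio (0..1) between two lines."""
    return SequenceMatcher(None, a, b).ratio()

# Example (hypothetical strings): near-identical lines score close to 1.0,
# unrelated lines close to 0.0:
# line_similarity("Автоматизация управления сетью",
#                 "Автоматизация управления сетью!")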