import os
import datetime
from difflib import SequenceMatcher
from string import punctuation

import requests
from tqdm import tqdm

# download stopwords corpus, you need to run it once
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2

# ------------------------------- SETTINGS -------------------------------
# directory of this file
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

# directory being checked
# LECTION_DIR = os.path.join("ЭАСвЗИ", "Лекции")
LECTION_DIR = os.path.join("ТЗИ", "Лекции", "ПМ3.2")

# whose work is being checked
who = "Савкин"

# URL of the submission to check
url = "http://213.155.192.79:3001/ypv/up/src/master/%D0%A2%D0%97%D0%98/%D0%9B%D0%B5%D0%BA%D1%86%D0%B8%D0%B8/%D0%9F%D0%9C3.2/1.1.200_%D0%A1%D0%BE%D0%B4%D0%B5%D1%80%D0%B6%D0%B0%D0%BD%D0%B8%D0%B5_%D0%B8_%D0%B7%D0%B0%D0%B4%D0%B0%D1%87%D0%B8_%D1%84%D0%B8%D0%B7%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B9_%D0%B7%D0%B0%D1%89%D0%B8%D1%82%D1%8B_%D0%BE%D0%B1%D1%8A%D0%B5%D0%BA%D1%82%D0%BE%D0%B2_%D0%B8%D0%BD%D1%84%D0%BE%D1%80%D0%BC%D0%B0%D1%82%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D0%B8/README.md"
# ------------------------------- / SETTINGS -----------------------------

# create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Lowercase, strip punctuation and stopwords, lemmatize; return (tokens, text)."""
    # replace every punctuation character with a space, then split into words
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # strip any punctuation still attached to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)

    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text


print()
now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M")
out_str = f"Checking: {who}, checked at: {now}\n"
print(out_str)

response = requests.get(url)
post_html = response.text

post_list = post_html.split("\n")

# check that the first line is a properly formatted markdown heading
line_1 = post_list[0]
if not line_1.startswith("#"):
    print("Article heading not found")
header_text = line_1.replace("# ", "").replace(".", "").strip()

# look for other lectures on the same topic
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()

lection_exist = False
readme_list = readme_html.split("\n")
for readme_str in readme_list:
    readme_str = readme_str.strip()
    # drop the leading list number ("1.", "2.", ...) to get the bare title
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)
    name_str = " ".join(readme_str_list).replace(".", "")

    if name_str == header_text:
        print("Lecture found")
        lection_exist = True
        post_tokens, post_uniq_text = preprocess_text(post_html)
        print(f"number of unique words: {len(set(post_tokens))}")
        print()

    # an empty line marks the end of the list of lecture variants
    if lection_exist and readme_str == "":
        lection_exist = False

    # the lines that follow the lecture title are the existing variants
    if lection_exist and name_str != header_text:
        variant_name, _ = readme_str.split("]", 1)
        variant_name = variant_name.strip("[")
        print(f"checking {variant_name}")

        _, variant_uri = readme_str.split("(", 1)
        variant_uri = variant_uri.replace("),", "").strip()

        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()

        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        print(f"number of unique words in the variant: {len(set(variant_tokens))}")

        # intersection of the two unique-word sets
        c = list(set(post_tokens) & set(variant_tokens))
        # share of the submission's unique words NOT found in the variant
        ratio = (1 - len(c) / len(set(post_tokens))) * 100
        print(f"matching words: {len(c)}, originality: {ratio:.1f}%")
        print()

# the whole-repository similarity scan below is unreachable: exit() stops here
exit()

files_paths = []
dirs = os.listdir(BASE_DIR)
for dir_name in dirs:
    dir_path = os.path.join(BASE_DIR, dir_name)
    if os.path.isdir(dir_path) and dir_name != "__pycache__":
        for file_name in os.listdir(dir_path):
            file_path = os.path.join(BASE_DIR, dir_name, file_name)
            filename, fileext = os.path.splitext(file_name)
            if os.path.isfile(file_path) and fileext == ".md":
                files_paths.append(file_path)

out_str = ""
max_ratio = 0
max_ratio_file = ""
for file_1 in tqdm(files_paths):
    small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")
    try:
        with open(file_1, encoding="utf-8") as f_1:
            str1 = f_1.read()
    except UnicodeDecodeError:
        # legacy cp1251 file: re-read it and rewrite it as utf-8
        with open(file_1, encoding="cp1251") as f_1:
            str1 = f_1.read()
        with open(file_1, "w", encoding="utf-8") as f_1:
            f_1.write(str1)

    ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
    if ratio > 70:
        out_str += f"{small_filename_1}\n"
        out_str += f"ratio = {ratio}\n"
    if ratio > max_ratio:
        max_ratio = ratio
        max_ratio_file = small_filename_1

print(out_str)
print()
print(f"max ratio: {max_ratio}%")
print(f"max ratio file: {max_ratio_file}")
print("success")
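
# A minimal usage sketch for preprocess_text; the sample sentence below is an
# assumption for illustration, not text taken from the checked repository:
#
#   tokens, text = preprocess_text("Задачи физической защиты объектов информатизации.")
#   print(tokens)  # lemmatized tokens, e.g. ["задача", "физический", "защита", ...]
#   print(text)    # the same tokens joined by single spaces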