import os
from difflib import SequenceMatcher
from tqdm import tqdm
import datetime
import requests

# download stopwords corpus, you need to run it once
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords

import pymorphy2
from string import punctuation

# Create lemmatizer and stopwords list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Tokenize, lemmatize and clean Russian *text*.

    Parameters
    ----------
    text : str
        Raw document text (markdown/plain).

    Returns
    -------
    tuple[list[str], str]
        ``(tokens, joined)`` where *tokens* are the normalized (lemma)
        forms of the words that are neither Russian stopwords nor
        punctuation, and *joined* is the same tokens joined by spaces.
    """
    # Replace every ASCII punctuation character with a space in a single
    # C-level pass, then lowercase and split on whitespace.  After this
    # step no punctuation can cling to a word, so no per-character
    # cleanup loop is needed (the original one was dead code).
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator).lower().split()

    tokens = [
        morph.parse(word)[0].normal_form
        for word in words
        if word not in russian_stopwords
        and word != " "
        and word.strip() not in punctuation
    ]
    return tokens, " ".join(tokens)


# directory containing this file
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

# URL of the article to check
url = "http://213.155.192.79:3001/ypv/up/raw/master/%d0%ad%d0%90%d0%a1%d0%b2%d0%97%d0%98/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.3.300_%d0%9a%d1%80%d0%b8%d1%82%d0%b5%d1%80%d0%b8%d0%b8_%d0%ba%d0%bb%d0%b0%d1%81%d1%81%d0%b8%d1%84%d0%b8%d0%ba%d0%b0%d1%86%d0%b8%d0%b8_%d1%83%d0%b3%d1%80%d0%be%d0%b7/Doc.md"
who = "Савкин С."
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Проверка: {who}, время проверки: {now} \n"

# download the article under review
response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")

# validate the first line: it must be a markdown header.
# startswith() also survives an empty first line (the original
# line_1[0] raised IndexError on an empty response).
line_1 = post_list[0] if post_list else ""
if not line_1.startswith("#"):
    out_str += "Заголовок статьи не найден\n"
header_text = line_1.replace("# ", "").replace(".", "").strip()

# look for other lectures on this topic in README.md
readme_path = os.path.join(BASE_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    # legacy files may be stored in Windows-1251
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()

lection_exist = False   # True while we are inside the matched lecture's variant list
lection_found = False   # remembers whether the lecture title was matched at all;
                        # kept separate so resetting lection_exist at the end of
                        # the variant list no longer triggers a false
                        # "Лекция НЕ найдена" report (bug in the original)
for readme_str in readme_html.split("\n"):
    readme_str = readme_str.strip()
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)  # drop the leading list marker / numbering
    name_str = " ".join(readme_str_list).replace(".", "")

    if str(name_str) == str(header_text):
        out_str += "Лекция найдена\n"
        lection_exist = True
        lection_found = True
        post_tokens, post_uniq_text = preprocess_text(post_html)
        out_str += f"количество уникальных слов: {len(set(post_tokens))}\n\n"

    # a blank line marks the end of the list of lecture variants
    if lection_exist and readme_str == "":
        lection_exist = False

    # lines following the lecture title: "[name](uri)," entries to compare against
    if lection_exist and str(name_str) != str(header_text):
        variant_name, _ = readme_str.split("]")
        variant_name = variant_name.strip("[")
        out_str += f"проверяю {variant_name}\n"

        _, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "").strip()
        variant_path = os.path.join(BASE_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()

        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        out_str += f"количество уникальных слов варианта: {len(set(variant_tokens))}\n"

        # intersection of the two token sets
        c = list(set(post_tokens) & set(variant_tokens))
        # guard against an article that produced no tokens at all
        post_uniq_count = len(set(post_tokens))
        ratio = 1 - (len(c) / post_uniq_count) if post_uniq_count else 0
        out_str += f"количество совпадающих слов: {len(c)} / {ratio}%\n\n"

if not lection_found:
    out_str += "Лекция НЕ найдена\n"

out_str += "\n\n"
print(out_str)

# prepend this run's report to the log file
log_path = os.path.join(BASE_DIR, "log.md")
try:
    with open(log_path, "r", encoding="utf-8") as f_log:
        prev_str = f_log.read()
except FileNotFoundError:
    prev_str = ""  # first run: no log yet
with open(log_path, "w", encoding="utf-8") as f_log:
    f_log.write(out_str + prev_str)