|
@@ -1,142 +0,0 @@
|
|
|
-import os
|
|
|
-from difflib import SequenceMatcher
|
|
|
-from tqdm import tqdm
|
|
|
-import datetime
|
|
|
-import requests
|
|
|
-
|
|
|
-# download stopwords corpus, you need to run it once
|
|
|
-import nltk
|
|
|
-#nltk.download("stopwords")
|
|
|
-from nltk.corpus import stopwords
|
|
|
-import pymorphy2
|
|
|
-from string import punctuation
|
|
|
-
|
|
|
# Create the lemmatizer and the Russian stopword list once at import time;
# both are shared by every preprocess_text() call below.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
|
|
|
-
|
|
|
# Preprocess function
def preprocess_text(text):
    """Tokenize, clean and lemmatize Russian *text*.

    ASCII punctuation is replaced with spaces in one pass, the text is
    lower-cased and split into words, stopwords and punctuation-only tokens
    are dropped, and each remaining token is reduced to its normal form
    with pymorphy2.

    Returns:
        tuple: (tokens, text) where *tokens* is the list of lemmatized
        tokens and *text* is those tokens joined with single spaces.
    """
    # Map every ASCII punctuation character to a space (single C-level pass).
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator).lower().split()

    # BUGFIX/cleanup: the original then re-scanned each word, copying it
    # character by character while skipping punctuation — but translate()
    # above already removed every character in `punctuation`, so that loop
    # was dead code (and quadratic string building); it is removed here.
    # NOTE(review): non-ASCII punctuation («», —, …) is untouched, exactly
    # as in the original.
    tokens = [
        morph.parse(token)[0].normal_form
        for token in words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]

    text = " ".join(tokens)
    return tokens, text
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
# Directory containing this script; every file path below is resolved
# relative to it.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

# URL of the submitted lecture (raw Markdown) to be checked.
url = "http://213.155.192.79:3001/ypv/up/raw/master/%d0%ad%d0%90%d0%a1%d0%b2%d0%97%d0%98/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.3.300_%d0%9a%d1%80%d0%b8%d1%82%d0%b5%d1%80%d0%b8%d0%b8_%d0%ba%d0%bb%d0%b0%d1%81%d1%81%d0%b8%d1%84%d0%b8%d0%ba%d0%b0%d1%86%d0%b8%d0%b8_%d1%83%d0%b3%d1%80%d0%be%d0%b7/Doc.md"
# Author/reviewer name, embedded in the report header.
who = "Савкин С."

# Report accumulator: header line with reviewer name and timestamp;
# everything below appends to out_str and it is printed/logged at the end.
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Проверка: {who}, время проверки: {now} \n"

# Download the submitted lecture.
# NOTE(review): no timeout and no status-code check — a network failure or
# HTTP error surfaces as an exception / non-Markdown body here; confirm
# whether that is acceptable for this script's usage.
response = requests.get(url)
post_html = response.text

# The document as a list of lines; post_list[0] is expected to be the title.
post_list = post_html.split("\n")
|
|
|
-
|
|
|
# Validate that the first line of the post is a Markdown H1 title.
line_1 = post_list[0]
# BUGFIX: the original tested line_1[0] != "#", which raises IndexError
# when the document starts with an empty line; startswith() is safe and
# behaves identically on non-empty input (empty line -> "no title" report).
if not line_1.startswith("#"):
    out_str += "Заголовок статьи не найден\n"

# Normalized title text: "# " marker, dots and surrounding whitespace
# removed, so it can be compared against README entries normalized the
# same way.
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
|
|
|
-
|
|
|
# Look for other lectures on the same topic in the repository README.
readme_path = os.path.join(BASE_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    # Some repo files are saved in the legacy Windows encoding.
    # BUGFIX: narrowed from a bare `except:`, which also silently retried
    # on unrelated errors (e.g. a missing README would surface as a
    # confusing second FileNotFoundError from this fallback branch).
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
|
|
|
-
|
|
|
# Scan the README line by line: find the lecture title, then process the
# list of alternative variants that immediately follows it.
lection_exist = False   # True once the lecture title has been found; never reset
in_variants = False     # True only while scanning the variant list under the title
readme_list = readme_html.split("\n")
for readme_str in readme_list:
    readme_str = readme_str.strip()
    # Drop the leading numbering/bullet token and normalize the remainder
    # the same way header_text was normalized (dots removed).
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")

    if name_str == header_text:
        out_str += "Лекция найдена\n"
        lection_exist = True
        in_variants = True

        post_tokens, post_uniq_text = preprocess_text(post_html)
        out_str += f"количество уникальных слов: {len(set(post_tokens))}\n\n"

    # The variant list ends at the first empty line after the title.
    # BUGFIX: the original reused `lection_exist` both as the "found" flag
    # and the "inside the variant list" flag and reset it to False here, so
    # the final "Лекция НЕ найдена" report fired even when the lecture HAD
    # been found; a separate flag keeps the two meanings apart.
    if in_variants:
        if readme_str == "":
            in_variants = False

        # Lines after the title are Markdown links: "[name](relative/path),".
        if in_variants and name_str != header_text:
            # NOTE(review): assumes exactly one "]" and one "(" per line;
            # a malformed README line still raises ValueError here, as in
            # the original.
            variant_name, t = readme_str.split("]")
            variant_name = variant_name.strip("[")
            out_str += f"проверяю {variant_name}\n"
            t, variant_uri = readme_str.split("(")
            variant_uri = variant_uri.replace("),", "")
            variant_uri = variant_uri.strip()

            variant_path = os.path.join(BASE_DIR, variant_uri)
            try:
                with open(variant_path, encoding="utf-8") as f:
                    variant_html = f.read()
            except UnicodeDecodeError:
                # BUGFIX: narrowed from a bare `except:` — only fall back
                # to the legacy Windows encoding on a decode failure.
                with open(variant_path, encoding="cp1251") as f:
                    variant_html = f.read()

            variant_tokens, variant_uniq_text = preprocess_text(variant_html)
            out_str += f"количество уникальных слов варианта: {len(set(variant_tokens))}\n"

            # Overlap of the unique-token sets of the post and the variant.
            c = list(set(post_tokens) & set(variant_tokens))
            # NOTE(review): `ratio` is the fraction of post tokens NOT
            # shared (0..1), yet the report prints it with a "%" suffix —
            # kept byte-for-byte to preserve the existing report format;
            # confirm the intended metric with the author.
            ratio = 1 - (len(c) / len(set(post_tokens)))
            out_str += f"количество совпадающих слов: {len(c)} / {ratio}%\n\n"
|
|
|
-
|
|
|
# If the lecture title was never matched in the README, report that.
if not lection_exist:
    out_str += "Лекция НЕ найдена\n"

# Blank separator between runs in the log, then echo the report to stdout.
out_str +="\n\n"
print(out_str)
|
|
|
-
|
|
|
# Write the log: prepend this run's report so the newest entry is on top.
log_path = os.path.join(BASE_DIR, "log.md")
try:
    with open(log_path, "r", encoding="utf-8") as f_log:
        prev_str = f_log.read()
except FileNotFoundError:
    # BUGFIX: on the very first run log.md does not exist yet and the
    # original crashed here; start with an empty history instead.
    prev_str = ""

prev_str = out_str + prev_str
with open(log_path, "w", encoding="utf-8") as f_log:
    f_log.write(prev_str)
    # (removed the original's explicit f_log.close(): redundant inside
    # a with-block, which closes the file on exit)
|
|
|
-
|
|
|
-
|