@@ -4,6 +4,42 @@ from tqdm import tqdm
import datetime
import requests
+# Download the stopwords corpus; this only needs to be run once.
+import nltk
+#nltk.download("stopwords")
+from nltk.corpus import stopwords
+import pymorphy2
+from string import punctuation
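+
+# NOTE: nltk and pymorphy2 are third-party packages (pip install nltk pymorphy2);
+# pymorphy2 provides the morphological analyzer used for lemmatization below.
+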
+# Create the lemmatizer and the stopword list
+morph = pymorphy2.MorphAnalyzer()
+russian_stopwords = stopwords.words("russian")
+
+# Preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize
+def preprocess_text(text):
+    # replace every ASCII punctuation character with a space, then split
+    translator = str.maketrans(punctuation, ' ' * len(punctuation))
+    words = text.translate(translator)
+    words = words.lower().split()
+
+    # strip any punctuation still glued to a word (e.g. quotes around it)
+    clear_words = []
+    for word in words:
+        clear_word = "".join(s for s in word if s not in punctuation)
+        if clear_word:
+            clear_words.append(clear_word)
+
+    # lemmatize every token that is not a stopword
+    tokens = [
+        morph.parse(token)[0].normal_form
+        for token in clear_words
+        if token not in russian_stopwords
+    ]
+
+    text = " ".join(tokens)
+    return tokens, text
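+
+# Usage sketch (hypothetical input; the exact output depends on the nltk
+# stopword list and the pymorphy2 dictionaries):
+#   tokens, text = preprocess_text("Привет, мир! Моя первая лекция.")
+#   # roughly: tokens == ["привет", "мир", "первый", "лекция"]
+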
# directory of this file
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

@@ -17,7 +53,77 @@ response = requests.get(url)
post_html = response.text

post_list = post_html.split("\n")
-print(post_list[0])
+
+# check that the first line is well-formed: it must be the "# " article title
+line_1 = post_list[0]
+if not line_1.startswith("#"):
+    print("Article title not found")
+
+header_text = line_1.replace("# ", "")
+header_text = header_text.replace(".", "")
+header_text = header_text.strip()
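+
+# e.g. a first line of "# Моя первая лекция." yields header_text == "Моя первая лекция"
+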
+# look for other lectures on the same topic
+readme_path = os.path.join(BASE_DIR, "README.md")
+try:
+    with open(readme_path, encoding="utf-8") as f:
+        readme_html = f.read()
+except UnicodeDecodeError:
+    with open(readme_path, encoding="cp1251") as f:
+        readme_html = f.read()
+
+lection_exist = False
+readme_list = readme_html.split("\n")
+for readme_str in readme_list:
+    readme_str = readme_str.strip()
+    # drop the leading list marker; the rest of the line is the lecture name
+    readme_str_list = readme_str.split(" ")
+    readme_str_list.pop(0)
+    name_str = " ".join(readme_str_list)
+    name_str = name_str.replace(".", "")
+
+    if name_str == header_text:
+        print("Lecture found")
+        lection_exist = True
+
+        post_tokens, post_uniq_text = preprocess_text(post_html)
+        print(f"number of unique words: {len(set(post_tokens))}")
+        print()
+
+    # look for the end of the list of lecture variants (an empty line)
+    if lection_exist and readme_str == "":
+        lection_exist = False
+
+    # the lines after the lecture name hold its variants
+    if lection_exist and name_str != header_text:
+        # the text between "[" and "]" is the variant name
+        variant_name = readme_str.split("]", 1)[0].split("[", 1)[-1]
+        print(f"checking {variant_name}")
+        # the text after "(" is the variant URI
+        _, variant_uri = readme_str.split("(", 1)
+        # drop the closing ")" and a trailing comma, if any
+        variant_uri = variant_uri.strip().rstrip(",").rstrip(")")
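+
+        # NOTE: the parsing above assumes README entries shaped like
+        # "- [Variant title](lectures/variant1.md)," (trailing comma optional)
+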
+        variant_path = os.path.join(BASE_DIR, variant_uri)
+        try:
+            with open(variant_path, encoding="utf-8") as f:
+                variant_html = f.read()
+        except UnicodeDecodeError:
+            with open(variant_path, encoding="cp1251") as f:
+                variant_html = f.read()
+
+        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
+        print(f"number of unique words in the variant: {len(set(variant_tokens))}")
+
+        # set intersection: the post's unique words that also occur in the variant
+        common = set(post_tokens) & set(variant_tokens)
+        # share of the post's unique words NOT found in the variant
+        ratio = 1 - (len(common) / len(set(post_tokens)))
+        print(f"matching words: {len(common)}, non-matching share: {ratio:.0%}")
+        print()
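+
+        # e.g. 200 unique words in the post, 120 of them also in the variant:
+        # ratio = 1 - 120/200 = 0.4, printed as "40%"
+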
exit()
files_paths = []