import os
from difflib import SequenceMatcher
from tqdm import tqdm
import datetime
import requests
# the stopwords corpus must be downloaded once; uncomment the next call on first run
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation
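
# Third-party dependencies (PyPI package names; exact versions are an assumption):
#   pip install requests tqdm nltk pymorphy2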

# ------------------------------- SETTINGS ------------
# base directory (one level up, matching the repository layout for term 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory to check
# LECTION_DIR = os.path.join("ЭАСвЗИ", "Лекции")
LECTION_DIR = os.path.join("ТЗИ", "Лекции", "ПМ3.2")
# whose submission is being checked
who = "Савкин"
# URL of the submission to check
url = "http://213.155.192.79:3001/ypv/up/src/master/%D0%A2%D0%97%D0%98/%D0%9B%D0%B5%D0%BA%D1%86%D0%B8%D0%B8/%D0%9F%D0%9C3.2/1.1.200_%D0%A1%D0%BE%D0%B4%D0%B5%D1%80%D0%B6%D0%B0%D0%BD%D0%B8%D0%B5_%D0%B8_%D0%B7%D0%B0%D0%B4%D0%B0%D1%87%D0%B8_%D1%84%D0%B8%D0%B7%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B9_%D0%B7%D0%B0%D1%89%D0%B8%D1%82%D1%8B_%D0%BE%D0%B1%D1%8A%D0%B5%D0%BA%D1%82%D0%BE%D0%B2_%D0%B8%D0%BD%D1%84%D0%BE%D1%80%D0%BC%D0%B0%D1%82%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D0%B8/README.md"
# ------------------------------- / SETTINGS ------------

# create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
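
# For reference, lemmatization maps each word form to its dictionary form,
# e.g. morph.parse("информации")[0].normal_form -> "информация" (the exact
# lemma depends on the installed pymorphy2 dictionaries).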

# preprocessing: strip punctuation, lowercase, lemmatize, drop stopwords
def preprocess_text(text):
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # strip any punctuation still attached to a word (e.g.: word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
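
# A minimal usage sketch (the lemmas shown are an assumption; exact output
# depends on the pymorphy2 dictionaries and the NLTK stopword list):
#   tokens, text = preprocess_text("Защита информации и объекты.")
#   tokens -> ["защита", "информация", "объект"]
#   text   -> "защита информация объект"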

print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Checking: {who}, checked at: {now}\n"
print(out_str)

response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")

# check that the first line is a heading
line_1 = post_list[0]
if not line_1.startswith("#"):
    print("Article heading not found")

# check for the questions section and the reference list
quest_exist = False
source_exist = False
for post_line in post_list:
    if post_line.startswith("##"):
        # section headings are matched in Russian, as they appear in the lectures
        if "Вопросы" in post_line:
            quest_exist = True
        if "Список литературы" in post_line:
            source_exist = True
if not quest_exist:
    print("Questions section not found")
if not source_exist:
    print("Reference list not found")

# normalize the heading: drop the "# " marker and any dots
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()

# look for other lectures on the same topic
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()

lection_exist = False  # inside the block of variants for the checked lecture
lection_found = False  # the lecture title was found at all
readme_list = readme_html.split("\n")
for readme_str in readme_list:
    readme_str = readme_str.strip()
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)  # drop the leading list number
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")
    if name_str == header_text:
        print("Lecture found")
        lection_exist = True
        lection_found = True
        post_tokens, post_uniq_text = preprocess_text(post_html)
        print(f"number of unique words: {len(set(post_tokens))}")
        print()
    # a blank line marks the end of this lecture's list of variants
    if lection_exist:
        if readme_str == "":
            lection_exist = False
    # lines after the lecture title are alternative variants of the same topic
    if lection_exist and name_str != header_text:
        # split the markdown link into its display name and URI
        variant_name, t = readme_str.split("]")
        variant_name = variant_name.strip("[")
        print(f"checking {variant_name}")
        t, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.strip()
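        # The parsing above assumes one markdown link per README line, e.g.
        # (a hypothetical entry, following the repo's list convention):
        #   1. [Иванов](1.1.200_Содержание_и_задачи.md),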

        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()
        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        print(f"number of unique words in the variant: {len(set(variant_tokens))}")
        # intersection of the two token sets
        c = list(set(post_tokens) & set(variant_tokens))
        ratio = (1 - len(c) / len(set(post_tokens))) * 100
        print(f"matching words: {len(c)}, uniqueness: {ratio:.0f}%")
        print()
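        # Worked example of the uniqueness metric (hypothetical numbers): if the
        # post has 200 unique tokens and 50 of them also occur in the variant,
        # uniqueness = (1 - 50/200) * 100 = 75%.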

if not lection_found:
    print("Lecture not found in the list")
    exit()

# collect all markdown files one directory level below BASE_DIR
files_paths = []
dirs = os.listdir(BASE_DIR)
for dir in dirs:
    dir_path = os.path.join(BASE_DIR, dir)
    if os.path.isdir(dir_path) and dir != "__pycache__":
        files = os.listdir(dir_path)
        for file in files:
            file_path = os.path.join(BASE_DIR, dir, file)
            filename, fileext = os.path.splitext(file)
            if os.path.isfile(file_path) and fileext == '.md':
                files_paths.append(file_path)
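
# Note: only the first directory level is scanned, which matches the assumed
# repository layout; a nested tree would need os.walk instead, e.g.:
#   for root, _, files in os.walk(BASE_DIR):
#       ...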
- out_str = ""
- max_ratio = 0
- max_ratio_file = ""
- for file_1 in tqdm(files_paths):
- small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")
- try:
- with open(file_1, encoding="utf-8") as f_1:
- str1 = f_1.read()
- except:
- with open(file_1, encoding="cp1251") as f_1:
- str1 = f_1.read()
- f_1.close()
- with open(file_1, 'w', encoding="utf-8") as f_1:
- f_1.write(str1)
- f_1.close()
-
- ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
- if (ratio > 70):
- out_str += f"{small_filename_1}\n"
- out_str += f"ratio = {ratio}\n"
- if (ratio > max_ratio):
- max_ratio = ratio
- max_ratio_file = small_filename_1
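
# For reference, SequenceMatcher.ratio() returns 2*M/T, where M is the number
# of matching characters and T the combined length of both strings; e.g.
# SequenceMatcher(None, "abcd", "bcde").ratio() == 0.75 (M = 3, T = 8).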

print(out_str)
print()
print(f"max ratio: {max_ratio}%")
print(f"max ratio file: {max_ratio_file}")
print("success")