| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 | import osfrom difflib import SequenceMatcherfrom tqdm import tqdmimport datetimeimport requests# download stopwords corpus, you need to run it onceimport nltk#nltk.download("stopwords")from nltk.corpus import stopwordsimport pymorphy2from string import punctuationfrom thefuzz import fuzz# ------------------------------- НАСТРОЙКИ ------------# директория файла (на уровень выше, для структуры репозиториев 2 сем. 2022-23)BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))# проверяемая директория# LECTION_DIR = os.path.join("ISRPO", "Лекции")# LECTION_DIR = os.path.join("EASvZI", "Лекции")LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")# ссылка для проверкиurl = "http://213.155.192.79:3001/kxdr/TZI/raw/8cae4615a64b92595cf8d1710c70f98bc56f32b7/2022-23/%d0%94%d0%b8%d1%84.%d0%b7%d0%b0%d1%87%d0%b5%d1%82_2%d1%81%d0%b5%d0%bc/sultan.md"# ------------------------------- / НАСТРОЙКИ ------------#Create lemmatizer and stopwords listmorph = pymorphy2.MorphAnalyzer()russian_stopwords = stopwords.words("russian")#Preprocess functiondef preprocess_text(text):    translator = str.maketrans(punctuation, ' '*len(punctuation))    words = text.translate(translator)    words = words.lower().split()        # очистка от прилегающего к слову мусора (слово, "или так")    clear_words = []    for word in words:        clear_word = ""        for s in word:            if not s in punctuation:                clear_word = clear_word + s        clear_words.append(clear_word)    tokens = []    tokens = [morph.parse(token)[0].normal_form for token in clear_words if token not in russian_stopwords\            and token != " " \            and token.strip() not in punctuation \            ]    text = " ".join(tokens)        return tokens, textprint()now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')out_str = f"Время проверки: {now} \n"# print(out_str)response = requests.get(url)post_html = response.textpost_list = post_html.split("\n")# проверяем правильность оформления 1й строкиheader_exist = Trueline_1 = post_list[0].strip()line_1 = line_1.replace(chr(65279), "")if (line_1[0:2]) != "# ":    print(f"Заголовок статьи не найден: '{line_1[0:1]} {line_1[1:2]}' вместо '# '")    print(f"{ord(line_1[0:1])} {ord(line_1[1:2])} вместо {ord('#')} {ord(' ')}")    header_exist = False# наличие вопросов и списка литературыquest_exist = Falsesource_exist = Falsefor post_line in post_list:    if (post_line[0:2] == "##"):        if ("Вопросы" in post_line):            quest_exist = True        if ("Список" in post_line) and ("литературы" in post_line):            source_exist = Trueif not (quest_exist):    print("Вопросы не найдены")if not (source_exist):    print("Список литературы не найден")header_text = line_1.replace("# ", "")header_text = header_text.replace(".", "")header_text = header_text.strip()# ищем другие лекции по этой темеreadme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")try:    with open(readme_path, encoding="utf-8") as f:        readme_html = f.read()except:    with open(readme_path, encoding="cp1251") as f:        readme_html = f.read()"""█    █    █████    ████████    █   ██   ██   ██    ██ █  █    ███████   ███████ █  █    ██   ██   ██  ██  ██     ██   ██   ██    ██"""lection_exist = Falsevariants_exist = Falsein_lections = False # начало поиска вариантовreadme_list = readme_html.split("\n")for readme_str in readme_list:    readme_str = readme_str.strip()    readme_str_list = readme_str.split(" ")    lection_number = readme_str_list[0]    readme_str_list.pop(0)    name_str = " ".join(readme_str_list)    name_str = name_str.replace(".", "")    name_str = name_str.strip()    if len(name_str)>0:        """        print(lection_number)        print(name_str)        print(header_text)        print(f"{ord(name_str[0:1])} {ord(name_str[1:2])} {ord(name_str[2:3])} вместо {ord(header_text[0:1])} {ord(header_text[1:2])} {ord(header_text[2:3])}")        print(fuzz.partial_ratio(name_str, header_text))        print()        """        if (str(name_str).lower() == str(header_text).lower()):            print("Лекция найдена в readme")            lection_exist = True            in_lections = True            post_tokens, post_uniq_text = preprocess_text(post_html)            print(f"количество уникальных слов: {len(set(post_tokens))}")            print()    # ищем конец списка вариантов лекций (пустая строка)    if lection_exist:        if (readme_str == ""):            in_lections = False    # следующие после названия лекции строки    if in_lections and (str(name_str).lower() != str(header_text).lower()):        variants_exist = True        variant_name, t = readme_str.split("]")        variant_name = variant_name.strip("[")        print(f"проверяю {variant_name}")        t, variant_uri = readme_str.split("(")        variant_uri = variant_uri.replace("),", "")        variant_uri = variant_uri.replace(")", "")        variant_uri = variant_uri.strip()                variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)        try:            with open(variant_path, encoding="utf-8") as f:                variant_html = f.read()        except:            with open(variant_path, encoding="cp1251") as f:                variant_html = f.read()        variant_tokens, variant_uniq_text = preprocess_text(variant_html)        print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")        # пересечение множеств         min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])        c = list(set(post_tokens) & set(variant_tokens))        ratio = (1 - (len(c) / min_tokens_len)) * 100        print(f"количество совпадающих слов: {len(c)} / {ratio:.2f}%")        print()if not(lection_exist):    print("Лекция не найдена в readme")if not(variants_exist):    print("Вариантов не найдено")exit()files_paths = []dirs = os.listdir(BASE_DIR)for dir in dirs:    dir_path = os.path.join(BASE_DIR, dir)    if os.path.isdir(dir_path) and (dir != "__pycache__"):        files = os.listdir(dir_path)        for file in files:            file_path = os.path.join(BASE_DIR, dir, file)            filename, fileext = os.path.splitext(file)            if os.path.isfile(file_path) and (fileext=='.md'):                files_paths.append(file_path)out_str = ""max_ratio = 0max_ratio_file = ""for file_1 in tqdm(files_paths):    small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")    try:        with open(file_1, encoding="utf-8") as f_1:            str1 = f_1.read()    except:        with open(file_1, encoding="cp1251") as f_1:            str1 = f_1.read()            f_1.close()        with open(file_1, 'w', encoding="utf-8") as f_1:            f_1.write(str1)            f_1.close()                        ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)    if (ratio > 70):        out_str += f"{small_filename_1}\n"        out_str += f"ratio = {ratio}\n"    if (ratio > max_ratio):        max_ratio = ratio        max_ratio_file = small_filename_1print(out_str)print()print(f"max ratio: {max_ratio}%")print(f"max ratio file: {max_ratio_file}")print("success")
 |