# Pairwise uniqueness check of all files in the given folder
import os
import datetime
from string import punctuation

# the stopwords corpus must be downloaded once before the first run:
# import nltk; nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2

# from thefuzz import fuzz

# ------------------------------- SETTINGS -------------------------------
# repository root (one level above this file, matching the repository
# layout of the 2nd semester 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# directory to check
# LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")
LECTION_DIR = os.path.join(BASE_DIR, "TZI", "Лекции", "ПМ3.1")
# ------------------------------- / SETTINGS -----------------------------


def log(msg: str = None):
    """Print a message and append it to the global log buffer."""
    global out_str
    if msg is None:
        print()
        out_str += "\n"
    else:
        print(msg)
        out_str += f"{msg}\n"


# Create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Lowercase, strip punctuation, drop stopwords and lemmatize.

    Returns the list of normalized tokens and the tokens joined into a string.
    """
    # replace every punctuation character with a space
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # strip punctuation still glued to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)

    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text


def read_text(file_path):
    """Read a file as UTF-8, falling back to CP1251 for legacy files."""
    try:
        with open(file_path, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(file_path, encoding="cp1251") as f:
            return f.read()


out_str = ""
now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M")
log(f"Check time: {now}")

# collect all .md files from the checked directory
files_paths = []
for file in os.listdir(LECTION_DIR):
    file_path = os.path.join(LECTION_DIR, file)
    filename, fileext = os.path.splitext(file)
    if os.path.isfile(file_path) and fileext == ".md":
        files_paths.append(file_path)

for file_1 in files_paths:
    for file_2 in files_paths:
        if file_1 == file_2:
            continue

        small_filename_1 = os.path.basename(file_1)
        small_filename_2 = os.path.basename(file_2)

        str1_tokens, str1_uniq_text = preprocess_text(read_text(file_1))
        str2_tokens, str2_uniq_text = preprocess_text(read_text(file_2))

        # intersection of the normalized token sets
        min_tokens_len = min(len(set(str1_tokens)), len(set(str2_tokens)))
        if min_tokens_len == 0:
            # guard against empty files, which would divide by zero
            log(f"uniqueness {small_filename_1} / {small_filename_2}: skipped (empty file)")
            log()
            continue
        common = set(str1_tokens) & set(str2_tokens)
        ratio = (1 - len(common) / min_tokens_len) * 100
        log(f"uniqueness {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
        log()

with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
    f_log.write(out_str)
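
# --------------------------------------------------------------------------
# Illustrative sketch, not part of the original checker: demo_a and demo_b
# are made-up toy Russian sentences showing how the set-overlap metric
# above behaves. Uniqueness is 1 - |common| / min(|A|, |B|) over the sets
# of lemmatized tokens, so two texts sharing most of their vocabulary
# score close to 0%, and fully unrelated texts close to 100%.
demo_a = "Информация должна быть защищена от несанкционированного доступа"
demo_b = "Защита информации ограничивает несанкционированный доступ"
demo_tokens_a, _ = preprocess_text(demo_a)
demo_tokens_b, _ = preprocess_text(demo_b)
demo_common = set(demo_tokens_a) & set(demo_tokens_b)
demo_min = min(len(set(demo_tokens_a)), len(set(demo_tokens_b)))
if demo_min:
    print(f"demo uniqueness: {(1 - len(demo_common) / demo_min) * 100:.2f}%")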