# Cross-check every file in the given folder against all the others
# (pairwise uniqueness check for submitted markdown files).
import datetime
import os
from string import punctuation

# Download the stopwords corpus; this only needs to be run once.
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords

import pymorphy2

# ------------------------------- SETTINGS -------------------------------
# Directory of this file (one level up, to match the repository layout
# of the 2nd semester, 2022-23).
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# Directory being checked.
LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")
# ------------------------------- / SETTINGS -----------------------------

out_str = ""


def log(message: str = None):
    """Print a message and accumulate it for the log file."""
    global out_str
    if message is None:
        print()
        out_str += "\n"
    else:
        print(message)
        out_str += f"{message}\n"


# Create the lemmatizer and the stopwords list.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")


def preprocess_text(text):
    """Lowercase, strip punctuation, drop stopwords and lemmatize."""
    # Replace every punctuation character with a space, then split;
    # this also strips punctuation glued to a word (word, "or like this").
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator).lower().split()

    tokens = [
        morph.parse(word)[0].normal_form
        for word in words
        if word not in russian_stopwords and word.strip() not in punctuation
    ]
    return tokens, " ".join(tokens)


def read_text_file(path):
    """Read a file as UTF-8, falling back to cp1251 for legacy files."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, encoding="cp1251") as f:
            return f.read()


now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M")
log(f"Check time: {now}")

# Collect the .md files in the checked directory.
files_paths = []
for file in os.listdir(LECTION_DIR):
    file_path = os.path.join(LECTION_DIR, file)
    _, fileext = os.path.splitext(file)
    if os.path.isfile(file_path) and fileext == ".md":
        files_paths.append(file_path)

for file_1 in files_paths:
    for file_2 in files_paths:
        if file_1 == file_2:
            continue
        small_filename_1 = os.path.basename(file_1)
        small_filename_2 = os.path.basename(file_2)

        str1_tokens, str1_uniq_text = preprocess_text(read_text_file(file_1))
        str2_tokens, str2_uniq_text = preprocess_text(read_text_file(file_2))

        # Intersection of the two token sets, normalized by the smaller set.
        min_tokens_len = min(len(set(str1_tokens)), len(set(str2_tokens)))
        if min_tokens_len == 0:
            log(f"uniqueness {small_filename_1} / {small_filename_2}: skipped (no tokens)")
            continue
        common = set(str1_tokens) & set(str2_tokens)
        ratio = (1 - len(common) / min_tokens_len) * 100
        log(f"uniqueness {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
    log()

with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
    f_log.write(out_str)
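
# -------------------------------------------------------------------------
# Worked example (illustrative only, not part of the original script):
# how the uniqueness metric above behaves on two tiny token sets.
#
#   set1   = {"информация", "защита", "система"}   -> 3 unique lemmas
#   set2   = {"информация", "сеть"}                -> 2 unique lemmas
#   common = {"информация"}                        -> 1 shared lemma
#   uniqueness = (1 - 1 / min(3, 2)) * 100 = (1 - 1/2) * 100 = 50.00%
#
# So 0% means the smaller file's vocabulary is fully contained in the
# other file, and 100% means the two files share no lemmas at all.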