# Pairwise originality check of all files in the given folder

import os
import datetime
from string import punctuation

# The stopwords corpus has to be downloaded once before the first run
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
 
# ------------------------------- SETTINGS --------------------------------

# Parent of the script's directory (one level up, matching the repository
# layout for semester 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

# Directory to check
LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")

# ------------------------------- / SETTINGS ------------------------------
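# Assumed layout (not verified by this script): the submissions to compare
# are Markdown files sitting directly in
#   <BASE_DIR>/EASvZI/2022-23/Самостоятельная_работа_1/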
 
def log(msg: str = None):
    """Print a message and mirror it into the global log buffer."""
    global out_str
    if msg is None:
        print()
        out_str += "\n"
    else:
        print(msg)
        out_str += f"{msg}\n"
 
# Create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
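# Illustrative sketch of what lemmatization buys us (exact lemmas depend on
# the installed pymorphy2 dictionaries): morph.parse("люди")[0].normal_form
# is expected to give "человек", so inflected forms of one word collapse
# into a single token before the sets are compared.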
 
# Preprocessing: drop punctuation and stopwords, lemmatize the rest
def preprocess_text(text):
    # Replace every punctuation character with a space
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()

    # Strip punctuation still attached to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)

    # Lemmatize and filter out stopwords and punctuation leftovers
    tokens = [morph.parse(token)[0].normal_form
              for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
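# Illustrative call (assumed output; actual lemmas depend on the dictionary):
# preprocess_text("Файлы проверяются!") should return something close to
# (["файл", "проверяться"], "файл проверяться").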
 
out_str = ""
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
log(f"Check time: {now}")
 
# Collect all Markdown files in the checked directory
files_paths = []
for file in os.listdir(LECTION_DIR):
    file_path = os.path.join(LECTION_DIR, file)
    _, fileext = os.path.splitext(file)
    if os.path.isfile(file_path) and fileext == '.md':
        files_paths.append(file_path)
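# Uniqueness metric used below: one minus the overlap coefficient of the two
# token sets, i.e. (1 - |A ∩ B| / min(|A|, |B|)) * 100.
# Worked example: A = {a, b, c}, B = {b, c, d} gives |A ∩ B| = 2 and
# min(|A|, |B|) = 3, so uniqueness = (1 - 2/3) * 100 ≈ 33.33%.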
 
# Compare every ordered pair of distinct files
for file_1 in files_paths:
    for file_2 in files_paths:
        if file_1 != file_2:
            small_filename_1 = os.path.basename(file_1)
            small_filename_2 = os.path.basename(file_2)

            # Read each file as UTF-8, falling back to CP1251
            try:
                with open(file_1, encoding="utf-8") as f_1:
                    str1 = f_1.read()
            except UnicodeDecodeError:
                with open(file_1, encoding="cp1251") as f_1:
                    str1 = f_1.read()
            try:
                with open(file_2, encoding="utf-8") as f_2:
                    str2 = f_2.read()
            except UnicodeDecodeError:
                with open(file_2, encoding="cp1251") as f_2:
                    str2 = f_2.read()
 
            str1_tokens, str1_uniq_text = preprocess_text(str1)
            str2_tokens, str2_uniq_text = preprocess_text(str2)

            # Intersection of the two token sets
            common = set(str1_tokens) & set(str2_tokens)
            min_tokens_len = min(len(set(str1_tokens)), len(set(str2_tokens)))
            # Guard against empty files to avoid division by zero
            ratio = (1 - len(common) / min_tokens_len) * 100 if min_tokens_len else 0.0
            log(f"uniqueness {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
    log()
 
# Write the accumulated log next to the checked files
with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
    f_log.write(out_str)
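# Usage sketch (the actual script name may differ):
#   python mutual_check.py
# Pairwise uniqueness is printed to the console and duplicated to log.txt
# inside LECTION_DIR.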
 
 