# Source repository: u22grozdev/up (forked from ypv/up)
							# взаимная проверка всех файлов в указанной папке
import os
from difflib import SequenceMatcher
from tqdm import tqdm
import datetime
import requests

# download stopwords corpus, you need to run it once
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation

# from thefuzz import fuzz

# ------------------------------- SETTINGS ------------
# Repository root: one level above the directory that contains this file
# (matches the layout of the 2nd-semester 2022-23 repositories).
# abspath() is applied to __file__ first: taking dirname() twice of a bare
# relative name such as "script.py" gives "", which abspath() would resolve
# to the current working directory instead of its parent.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory whose files are checked against each other
#LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")
LECTION_DIR = os.path.join(BASE_DIR, "TZI", "Лекции", "ПМ3.1")

# ------------------------------- / SETTINGS ------------


def log(str: str = None):
    """Print one report line and append it to the global ``out_str`` buffer.

    ``out_str`` must already exist at module level when this is called,
    because the line is appended to it in place.

    Args:
        str: Text to emit. When omitted (``None``) an empty line is emitted.
            The name shadows the builtin; it is kept unchanged so that
            existing keyword callers keep working.
    """
    global out_str
    # Identity check: only a real None means "blank line".
    line = "" if str is None else str
    print(line)
    out_str += f"{line}\n"


# Single shared pymorphy2 analyzer used to lemmatize every token.
morph = pymorphy2.MorphAnalyzer()
# Russian stop words from the NLTK corpus (the corpus has to be downloaded
# once, see the nltk.download("stopwords") note next to the imports).
# Kept as a frozenset because it is only used for membership tests, which
# are then constant-time instead of a scan over the whole list per token.
russian_stopwords = frozenset(stopwords.words("russian"))

# Translation table mapping every ASCII punctuation character to a space.
# It never changes, so it is built once instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(punctuation, " " * len(punctuation))


def preprocess_text(text):
    """Turn *text* into lemmatized tokens without punctuation or stop words.

    Steps: every character of ``string.punctuation`` is replaced by a space,
    the text is lower-cased and split on whitespace, words found in
    ``russian_stopwords`` are dropped, and each remaining word is replaced
    by the normal form of its first ``morph.parse`` result.

    Args:
        text: Raw text to process.

    Returns:
        A ``(tokens, text)`` tuple: the list of normal forms in their
        original order (duplicates kept), and the same tokens joined by
        single spaces.
    """
    # After translate() + split() no word contains punctuation or whitespace
    # and no word is empty, so no further per-word cleaning is required.
    words = text.translate(_PUNCT_TO_SPACE).lower().split()

    # Texts repeat words a lot; analyze each distinct word only once.
    lemmas = {}
    tokens = []
    for word in words:
        if word in russian_stopwords:
            continue
        lemma = lemmas.get(word)
        if lemma is None:
            lemma = lemmas[word] = morph.parse(word)[0].normal_form
        tokens.append(lemma)

    return tokens, " ".join(tokens)


# Report buffer: log() appends every emitted line to this string.
out_str = ""
# Time of the check, rendered as day-month-year hours:minutes.
now = f"{datetime.datetime.now():%d-%m-%Y %H:%M}"
log(f"Время проверки: {now}")


# Collect the ".md" files located directly in LECTION_DIR (no recursion).
# os.listdir() returns names in arbitrary order, so they are sorted to keep
# the order of the report stable between runs and platforms.
files = sorted(os.listdir(LECTION_DIR))
files_paths = [
    path
    for path in (os.path.join(LECTION_DIR, name) for name in files)
    if os.path.isfile(path) and os.path.splitext(path)[1] == ".md"
]


def _read_text(path):
    """Return the text of *path*, decoded as UTF-8 with a cp1251 fallback."""
    try:
        with open(path, encoding="utf-8") as source:
            return source.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with the cp1251 code page.
        with open(path, encoding="cp1251") as source:
            return source.read()


def _uniqueness_percent(tokens_1, tokens_2):
    """Return the uniqueness of two token sets relative to each other, in %.

    The number of shared tokens is divided by the size of the smaller set;
    the result is ``100 * (1 - share)``. When either set is empty nothing
    can be shared, so 100.0 is returned instead of dividing by zero.
    """
    smaller = min(len(tokens_1), len(tokens_2))
    if smaller == 0:
        return 100.0
    return (1 - len(tokens_1 & tokens_2) / smaller) * 100


# Read and lemmatize every file exactly once; the pairwise loop below only
# compares the cached token sets.
token_sets = {
    path: set(preprocess_text(_read_text(path))[0]) for path in files_paths
}

# Compare every ordered pair of distinct files; one blank line is logged
# after all comparisons of a given first file.
for file_1 in files_paths:
    # basename() gives the bare file name regardless of the path separator.
    small_filename_1 = os.path.basename(file_1)
    for file_2 in files_paths:
        if file_1 == file_2:
            continue
        small_filename_2 = os.path.basename(file_2)
        ratio = _uniqueness_percent(token_sets[file_1], token_sets[file_2])
        log(f"уникальность {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
    log()

# Save the accumulated report as log.txt inside the checked directory.
# The with-statement closes the file, so no explicit close() is needed.
with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
    f_log.write(out_str)