plagiat_full.v2.py
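"""Pairwise plagiarism check of every file in a given folder.

For each pair of .md files the script lemmatizes the words, drops Russian
stopwords and punctuation, and reports the share of non-overlapping lemmas
as a "uniqueness" percentage. The report is echoed to the console and
written to LECTION_DIR/log.txt.
"""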
import os
import datetime
# download the stopwords corpus once before the first run
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation
# from thefuzz import fuzz  # alternative similarity backend, unused here
# ------------------------------- SETTINGS -------------------------------
# repo parent directory (one level up, matching the repo layout of the
# 2nd semester, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory under check
# LECTION_DIR = os.path.join(BASE_DIR, "EASvZI", "2022-23", "Самостоятельная_работа_1")
LECTION_DIR = os.path.join(BASE_DIR, "TZI", "Лекции", "ПМ3.1")
# ------------------------------- / SETTINGS -----------------------------
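# Example layout these paths assume (an illustration, not verified by the script):
#   <workspace>/<this_repo>/plagiat_full.v2.py  ->  BASE_DIR = <workspace>
#   <workspace>/TZI/Лекции/ПМ3.1/*.md           ->  files under check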
def log(msg: str = None):
    """Print a line and append it to the global report buffer."""
    global out_str
    if msg is None:
        print()
        out_str += "\n"
    else:
        print(msg)
        out_str += f"{msg}\n"
# create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
# preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize
def preprocess_text(text):
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # strip any punctuation still glued to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word = clear_word + s
        clear_words.append(clear_word)
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
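# Quick sanity check (assumes the NLTK stopword corpus and pymorphy2
# dictionaries are installed; the exact lemmas are illustrative):
#   tokens, text = preprocess_text("Информация и защита информации")
#   # -> stopword "и" is dropped, the rest is lemmatized,
#   #    e.g. tokens == ["информация", "защита", "информация"]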
out_str = ""
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
log(f"Check time: {now}")
files_paths = []
files = os.listdir(LECTION_DIR)
for file in files:
    file_path = os.path.join(LECTION_DIR, file)
    filename, fileext = os.path.splitext(file)
    if os.path.isfile(file_path) and (fileext == '.md'):
        files_paths.append(file_path)
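# Note: only files directly inside LECTION_DIR are compared;
# subdirectories are not scanned.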
# compare every ordered pair, so each file gets a row against every other
for file_1 in files_paths:
    for file_2 in files_paths:
        if file_1 != file_2:
            small_filename_1 = os.path.basename(file_1)
            small_filename_2 = os.path.basename(file_2)
            # read as UTF-8, fall back to cp1251 for legacy-encoded files
            try:
                with open(file_1, encoding="utf-8") as f_1:
                    str1 = f_1.read()
            except UnicodeDecodeError:
                with open(file_1, encoding="cp1251") as f_1:
                    str1 = f_1.read()
            try:
                with open(file_2, encoding="utf-8") as f_2:
                    str2 = f_2.read()
            except UnicodeDecodeError:
                with open(file_2, encoding="cp1251") as f_2:
                    str2 = f_2.read()
            str1_tokens, str1_uniq_text = preprocess_text(str1)
            str2_tokens, str2_uniq_text = preprocess_text(str2)
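            # Worked example of the metric below (illustrative numbers):
            #   file A has 120 distinct lemmas, file B has 95, 40 are shared;
            #   uniqueness = (1 - 40 / min(120, 95)) * 100 ≈ 57.89%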
            # intersection of the lemma sets
            min_tokens_len = min(len(set(str1_tokens)), len(set(str2_tokens)))
            if min_tokens_len == 0:
                continue  # skip empty files to avoid division by zero
            c = list(set(str1_tokens) & set(str2_tokens))
            ratio = (1 - (len(c) / min_tokens_len)) * 100
            log(f"uniqueness {small_filename_1} / {small_filename_2}: {ratio:.2f}%")
log()
with open(os.path.join(LECTION_DIR, "log.txt"), "w", encoding="utf-8") as f_log:
    f_log.write(out_str)
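# Typical run (a sketch; assumes Python 3 with nltk and pymorphy2 installed,
# and that nltk.download("stopwords") has been executed once):
#   pip install nltk pymorphy2
#   python plagiat_full.v2.py
# The report is printed to the console and saved to <LECTION_DIR>/log.txt.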