plagiat_1.v2.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import os
  2. from difflib import SequenceMatcher
  3. from tqdm import tqdm
  4. import datetime
  5. import requests
  6. # download stopwords corpus, you need to run it once
  7. import nltk
  8. #nltk.download("stopwords")
  9. from nltk.corpus import stopwords
  10. import pymorphy2
  11. from string import punctuation
# ------------------------------- SETTINGS ------------
# Directory this script lives in; all relative paths resolve against it.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
# Course sub-directory being checked (relative to BASE_DIR).
LECTION_DIR = os.path.join("ЭАСвЗИ", "Лекции")
# Name of the student whose submission is being checked.
who = "Савкин"
# Raw URL of the submission to check.
url = "http://213.155.192.79:3001/ypv/up/raw/master/%d0%ad%d0%90%d0%a1%d0%b2%d0%97%d0%98/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.3.300_%d0%9a%d1%80%d0%b8%d1%82%d0%b5%d1%80%d0%b8%d0%b8_%d0%ba%d0%bb%d0%b0%d1%81%d1%81%d0%b8%d1%84%d0%b8%d0%ba%d0%b0%d1%86%d0%b8%d0%b8_%d1%83%d0%b3%d1%80%d0%be%d0%b7/Doc.md"
# ------------------------------- / SETTINGS ------------
# Create lemmatizer and stopwords list (requires nltk stopwords corpus,
# downloaded once via the commented-out nltk.download call above).
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
  25. #Preprocess function
  26. def preprocess_text(text):
  27. translator = str.maketrans(punctuation, ' '*len(punctuation))
  28. words = text.translate(translator)
  29. words = words.lower().split()
  30. # очистка от прилегающего к слову мусора (слово, "или так")
  31. clear_words = []
  32. for word in words:
  33. clear_word = ""
  34. for s in word:
  35. if not s in punctuation:
  36. clear_word = clear_word + s
  37. clear_words.append(clear_word)
  38. tokens = []
  39. tokens = [morph.parse(token)[0].normal_form for token in clear_words if token not in russian_stopwords\
  40. and token != " " \
  41. and token.strip() not in punctuation \
  42. ]
  43. text = " ".join(tokens)
  44. return tokens, text
  45. print()
  46. now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
  47. out_str = f"Проверка: {who}, время проверки: {now} \n"
  48. print(out_str)
  49. response = requests.get(url)
  50. post_html = response.text
  51. post_list = post_html.split("\n")
  52. # проверяем правильность оформления 1й строки
  53. line_1 = post_list[0]
  54. if (line_1[0]) != "#":
  55. print("Заголовок статьи не найден")
  56. header_text = line_1.replace("# ", "")
  57. header_text = header_text.replace(".", "")
  58. header_text = header_text.strip()
  59. # ищем другие лекции по этой теме
  60. readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
  61. try:
  62. with open(readme_path, encoding="utf-8") as f:
  63. readme_html = f.read()
  64. except:
  65. with open(readme_path, encoding="cp1251") as f:
  66. readme_html = f.read()
  67. lection_exist = False
  68. readme_list = readme_html.split("\n")
  69. for readme_str in readme_list:
  70. readme_str = readme_str.strip()
  71. readme_str_list = readme_str.split(" ")
  72. readme_str_list.pop(0)
  73. name_str = " ".join(readme_str_list)
  74. name_str = name_str.replace(".", "")
  75. if (str(name_str) == str(header_text)):
  76. print("Лекция найдена")
  77. lection_exist = True
  78. post_tokens, post_uniq_text = preprocess_text(post_html)
  79. print(f"количество уникальных слов: {len(set(post_tokens))}")
  80. print()
  81. # ищем конец списка вариантов лекций (пустая строка)
  82. if lection_exist:
  83. if (readme_str == ""):
  84. lection_exist = False
  85. # следующие после названия лекции строки
  86. if lection_exist and (str(name_str) != str(header_text)):
  87. variant_name, t = readme_str.split("]")
  88. variant_name = variant_name.strip("[")
  89. print(f"проверяю {variant_name}")
  90. t, variant_uri = readme_str.split("(")
  91. variant_uri = variant_uri.replace("),", "")
  92. variant_uri = variant_uri.strip()
  93. variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
  94. try:
  95. with open(variant_path, encoding="utf-8") as f:
  96. variant_html = f.read()
  97. except:
  98. with open(variant_path, encoding="cp1251") as f:
  99. variant_html = f.read()
  100. variant_tokens, variant_uniq_text = preprocess_text(variant_html)
  101. print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")
  102. # пересечение множеств
  103. c = list(set(post_tokens) & set(variant_tokens))
  104. ratio = 1 - (len(c) / len(set(post_tokens)))
  105. print(f"количество совпадающих слов: {len(c)} / {ratio}%")
  106. print()
  107. exit()
  108. files_paths = []
  109. dirs = os.listdir(BASE_DIR)
  110. for dir in dirs:
  111. dir_path = os.path.join(BASE_DIR, dir)
  112. if os.path.isdir(dir_path) and (dir != "__pycache__"):
  113. files = os.listdir(dir_path)
  114. for file in files:
  115. file_path = os.path.join(BASE_DIR, dir, file)
  116. filename, fileext = os.path.splitext(file)
  117. if os.path.isfile(file_path) and (fileext=='.md'):
  118. files_paths.append(file_path)
  119. out_str = ""
  120. max_ratio = 0
  121. max_ratio_file = ""
  122. for file_1 in tqdm(files_paths):
  123. small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")
  124. try:
  125. with open(file_1, encoding="utf-8") as f_1:
  126. str1 = f_1.read()
  127. except:
  128. with open(file_1, encoding="cp1251") as f_1:
  129. str1 = f_1.read()
  130. f_1.close()
  131. with open(file_1, 'w', encoding="utf-8") as f_1:
  132. f_1.write(str1)
  133. f_1.close()
  134. ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
  135. if (ratio > 70):
  136. out_str += f"{small_filename_1}\n"
  137. out_str += f"ratio = {ratio}\n"
  138. if (ratio > max_ratio):
  139. max_ratio = ratio
  140. max_ratio_file = small_filename_1
  141. print(out_str)
  142. print()
  143. print(f"max ratio: {max_ratio}%")
  144. print(f"max ratio file: {max_ratio_file}")
  145. print("success")