plagiat_1.v2.py

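"""
plagiat_1.v2.py

Checks a single lecture (downloaded from `url`) for:
  * formatting: an H1 title on the first line plus "## Вопросы" and
    "## Список литературы" sections;
  * token overlap with the other variants of the same lecture listed in
    the course README.md (lemmatization via pymorphy3, NLTK stopwords);
  * overall similarity with every .md file in the repository, using
    difflib.SequenceMatcher.
"""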
import os
from difflib import SequenceMatcher
from tqdm import tqdm
import datetime
import requests
# download stopwords corpus, you need to run it once
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy3
from string import punctuation
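# third-party dependencies (install once): pip install requests tqdm nltk pymorphy3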
# ------------------------------- SETTINGS ------------
# repository root (one level up from this file, matching the repository
# structure of semester 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory to check
LECTION_DIR = os.path.join("EASvZI", "Лекции")
# URL of the lecture being checked
url = "http://213.155.192.79:3001/u20-24zurov/EASvZI/raw/c3265c710cc842b0c2c670a84a0b6a435d708e5e/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.4.120_%d0%9f%d1%80%d0%b0%d0%b2%d0%be%d0%b2%d1%8b%d0%b5_%d0%bc%d0%b5%d1%80%d1%8b_%d0%97%d0%98_%d0%b2_%d0%90%d0%a1/Zurov.md"
# ------------------------------- / SETTINGS ------------
# Create lemmatizer and stopwords list
morph = pymorphy3.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocess function
def preprocess_text(text):
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # strip punctuation still attached to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word += s
        clear_words.append(clear_word)
    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text
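# Illustrative usage (output assumes NLTK's Russian stopword list):
#   preprocess_text("Это тестовый текст!")
#   -> (['тестовый', 'текст'], 'тестовый текст')
# "это" is dropped as a stopword, punctuation is stripped, and the remaining
# words are reduced to their normal forms by pymorphy3.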
print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Время проверки: {now} \n"
# print(out_str)

# download the lecture text being checked
response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")
# check that the first line is formatted correctly
header_exist = True
line_1 = post_list[0].strip()
line_1 = line_1.replace(chr(65279), "")  # strip the UTF-8 BOM (U+FEFF) if present
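# a valid first line looks like "# <lecture title>", e.g. (hypothetical):
#   # Правовые меры ЗИ в АС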
if line_1[0:2] != "# ":
    print(f"Заголовок статьи не найден: '{line_1[0:1]} {line_1[1:2]}' вместо '# '")
    print(f"{ord(line_1[0:1])} {ord(line_1[1:2])} вместо {ord('#')} {ord(' ')}")
    header_exist = False
# check for the questions section and the reference list
quest_exist = False
source_exist = False
for post_line in post_list:
    if post_line[0:2] == "##":
        if "Вопросы" in post_line:
            quest_exist = True
        if ("Список" in post_line) and ("литературы" in post_line):
            source_exist = True
if not quest_exist:
    print("Вопросы не найдены")
if not source_exist:
    print("Список литературы не найден")
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
# look for other lectures on the same topic
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:  # older files may still be saved as cp1251
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
  82. """
  83. █ █ █████ ███████
  84. █ █ ██ ██ ██ ██
  85. █ █ ███████ ███████
  86. █ █ ██ ██ ██ ██
  87. ██ ██ ██ ██ ██
  88. """
lection_exist = False
variants_exist = False
in_lections = False  # True while scanning the variant list of the found lecture
readme_list = readme_html.split("\n")
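# Assumed README.md layout (inferred from the parsing below; names are
# hypothetical):
#   1.4.120 Правовые меры ЗИ в АС
#   [Zurov](1.4.120_Правовые_меры_ЗИ_в_АС/Zurov.md),
# i.e. a numbered lecture title, then one markdown link per variant,
# terminated by an empty line.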
for readme_str in readme_list:
    readme_str = readme_str.strip()
    readme_str_list = readme_str.split(" ")
    lection_number = readme_str_list[0]
    readme_str_list.pop(0)
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")
    name_str = name_str.strip()
    if len(name_str) > 0:
        """
        print(lection_number)
        print(name_str)
        print(header_text)
        print(f"{ord(name_str[0:1])} {ord(name_str[1:2])} {ord(name_str[2:3])} вместо {ord(header_text[0:1])} {ord(header_text[1:2])} {ord(header_text[2:3])}")
        print(fuzz.partial_ratio(name_str, header_text))
        print()
        """
        if str(name_str).lower() == str(header_text).lower():
            print("Лекция найдена в readme")
            lection_exist = True
            in_lections = True
            post_tokens, post_uniq_text = preprocess_text(post_html)
            print(f"количество уникальных слов: {len(set(post_tokens))}")
            print()
    # look for the end of the variant list (an empty line)
    if lection_exist:
        if readme_str == "":
            in_lections = False
    # lines that follow the lecture title are its variants: [name](uri),
    if in_lections and (str(name_str).lower() != str(header_text).lower()):
        variants_exist = True
        variant_name, t = readme_str.split("]")
        variant_name = variant_name.strip("[")
        print(f"проверяю {variant_name}")
        t, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()
        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()
        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        print(f"количество уникальных слов варианта: {len(set(variant_tokens))}")
        # intersection of the two unique-token sets
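        # `ratio` below is the share of tokens in the shorter text that do NOT
        # occur in the other one. Worked example (hypothetical numbers): with
        # 200 and 150 unique tokens of which 60 are shared,
        # ratio = (1 - 60/150) * 100 = 60.00, i.e. 60% of the shorter text is unique.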
        min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
        c = list(set(post_tokens) & set(variant_tokens))
        ratio = (1 - (len(c) / min_tokens_len)) * 100
        print(f"количество совпадающих слов: {len(c)} / {ratio:.2f}%")
        print()
if not lection_exist:
    print("Лекция не найдена в readme")
if not variants_exist:
    print("Вариантов не найдено")
    exit()  # nothing found via README; skip the whole-repository scan below
# collect every .md file that sits one directory level below the repo root
files_paths = []
dirs = os.listdir(BASE_DIR)
for dir in dirs:
    dir_path = os.path.join(BASE_DIR, dir)
    if os.path.isdir(dir_path) and (dir != "__pycache__"):
        files = os.listdir(dir_path)
        for file in files:
            file_path = os.path.join(BASE_DIR, dir, file)
            filename, fileext = os.path.splitext(file)
            if os.path.isfile(file_path) and (fileext == '.md'):
                files_paths.append(file_path)
out_str = ""
max_ratio = 0
max_ratio_file = ""
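# SequenceMatcher.ratio() returns a similarity in [0, 1] based on matching
# character blocks; e.g. SequenceMatcher(None, "abcd", "abce").ratio() == 0.75
# (2 * 3 matching characters / 8 total). Files scoring above 70% are reported.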
for file_1 in tqdm(files_paths):
    small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")
    try:
        with open(file_1, encoding="utf-8") as f_1:
            str1 = f_1.read()
    except UnicodeDecodeError:
        with open(file_1, encoding="cp1251") as f_1:
            str1 = f_1.read()
        # re-save the cp1251 file as UTF-8 in place
        with open(file_1, 'w', encoding="utf-8") as f_1:
            f_1.write(str1)
    ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
    if ratio > 70:
        out_str += f"{small_filename_1}\n"
        out_str += f"ratio = {ratio}\n"
    if ratio > max_ratio:
        max_ratio = ratio
        max_ratio_file = small_filename_1
print(out_str)
print()
print(f"max ratio: {max_ratio}%")
print(f"max ratio file: {max_ratio_file}")
print("success")