plagiat_1.v3.py
# single-URL check version, with Russian spell checking
import os
from difflib import SequenceMatcher
from tqdm import tqdm
import datetime
import requests
# download the stopwords corpus; this only needs to run once
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation
# ------------------------------- SETTINGS ------------
# base directory (one level up, matching the repository layout for semester 2, 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory to check
LECTION_DIR = os.path.join("Лекции", "ПМ3.1")
# URL to check
url = "http://213.155.192.79:3001/u21-25gavrilenko/TZI/src/f3364e43320adb5b8bd26c317d1b00fdea45997d/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/%d0%9f%d0%9c3.1%20%d0%93%d0%b0%d0%b2%d1%80%d0%b8%d0%bb%d0%b5%d0%bd%d0%ba%d0%be/%d0%93%d0%b0%d0%b2%d1%80%d0%b8%d0%bb%d0%b5%d0%bd%d0%ba%d0%be.md"
# ------------------------------- / SETTINGS ------------
url = url.replace("/src/", "/raw/", 1)
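# The host looks like a Gitea instance (assumption): swapping the /src/ path
# segment for /raw/ turns the HTML file view into the raw markdown download.
# Anchoring the replacement with slashes and limiting it to the first
# occurrence keeps a filename that happens to contain "src" intact.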
# create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
# preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize
def preprocess_text(text):
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # strip any punctuation still attached to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word = clear_word + s
        clear_words.append(clear_word)
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
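# Illustrative example (assumed behavior; exact output depends on the NLTK
# stopword list and the pymorphy2 dictionaries):
#   preprocess_text('Защита информации - это процесс.')
# returns roughly (['защита', 'информация', 'процесс'], 'защита информация процесс'):
# '-' and '.' are replaced with spaces, 'это' is dropped as a stopword, and
# 'информации' is lemmatized to its normal form 'информация'.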
# spelling check: count the tokens that LanguageTool flags
import language_tool_python
tool = language_tool_python.LanguageTool('ru-RU')
def orfo_text(tokens):
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            #print(matches[0].ruleId)
    return bad_tokens_n
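# Note: tool.check() is called on each lemma in isolation, so grammar rules
# that need sentence context never fire; a token counts as misspelled when
# LanguageTool returns at least one match for it. Checking token by token is
# slow, but it yields an error count per word rather than per sentence.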
print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Check time: {now} \n"
# print(out_str)
response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")
# check that the first line is formatted as a "# " heading
header_exist = True
line_1 = post_list[0].strip()
line_1 = line_1.replace(chr(65279), "")  # strip the UTF-8 BOM if present
line_1 = line_1.replace("#", "# ")
line_1 = line_1.replace("  ", " ")  # collapse the double space the previous replace creates
if (line_1[0:2]) != "# ":
    print(f"Article heading not found: '{line_1[0:1]} {line_1[1:2]}' instead of '# '")
    print(f"{ord(line_1[0:1])} {ord(line_1[1:2])} instead of {ord('#')} {ord(' ')}")
    header_exist = False
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
print(f"Lecture title: {header_text}")
# check for the questions section and the list of sources
quest_exist = False
source_exist = False
for post_line in post_list:
    if (post_line[0:2] == "##"):
        # the matched substrings are Russian section headings in the posts themselves
        if ("Вопросы" in post_line):
            quest_exist = True
        if ("Список литературы" in post_line) or ("Источники" in post_line):
            source_exist = True
if not (quest_exist):
    print("Questions section not found")
if not (source_exist):
    print("List of sources not found")
# look for other lectures on the same topic
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
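# The utf-8 attempt with a cp1251 fallback covers files saved by Windows
# editors in the legacy Russian codepage; the same pattern is reused for the
# variant files and the repository scan below.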
  98. """
  99. █ █ █████ ███████
  100. █ █ ██ ██ ██ ██
  101. █ █ ███████ ███████
  102. █ █ ██ ██ ██ ██
  103. ██ ██ ██ ██ ██
  104. """
lection_exist = False
variants_exist = False
in_lections = False  # True while we are inside the variants list for this lecture
readme_list = readme_html.split("\n")
for readme_str in readme_list:
    readme_str = readme_str.strip()
    readme_str = readme_str.replace("\u00a0", " ")  # normalize non-breaking spaces
    readme_str = readme_str.replace("- ", "")
    readme_str_list = readme_str.split(" ")
    lection_number = readme_str_list[0]
    readme_str_list.pop(0)
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")
    name_str = name_str.strip()
    if len(name_str) > 0:
        """
        print(lection_number)
        print(name_str)
        print(header_text)
        print(f"{ord(name_str[0:1])} {ord(name_str[1:2])} {ord(name_str[2:3])} instead of {ord(header_text[0:1])} {ord(header_text[1:2])} {ord(header_text[2:3])}")
        print(fuzz.partial_ratio(name_str, header_text))
        print()
        """
        if (str(name_str).lower() == str(header_text).lower()):
            print("Lecture found in the readme")
            lection_exist = True
            in_lections = True
            post_tokens, post_uniq_text = preprocess_text(post_html)
            print(f"unique word count: {len(set(post_tokens))}")
            bad_tokens_n = orfo_text(post_tokens)
            # truncate to two decimal places
            bad_tokens_stat = int(bad_tokens_n / len(post_tokens) * 10000) / 100
            print(f"spelling error rate: {bad_tokens_stat}%")
            print()
    # the variants list ends at the first empty line after the lecture heading
    if lection_exist:
        if (readme_str == ""):
            in_lections = False
    # the lines that follow the lecture title are its variants
    if in_lections and (str(name_str).lower() != str(header_text).lower()):
        variants_exist = True
        variant_name, t = readme_str.split("]", 1)
        variant_name = variant_name.strip("[")
        print(f"checking {variant_name}")
        t, variant_uri = readme_str.split("(", 1)
        # keep only the URI up to the first closing parenthesis
        variant_uri = variant_uri.split(")")[0].strip()
        if "youtube" in variant_uri:
            print(f"youtube in {variant_uri}")
            print()
        else:
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            try:
                with open(variant_path, encoding="utf-8") as f:
                    variant_html = f.read()
            except UnicodeDecodeError:
                with open(variant_path, encoding="cp1251") as f:
                    variant_html = f.read()
            variant_tokens, variant_uniq_text = preprocess_text(variant_html)
            print(f"unique word count for the variant: {len(set(variant_tokens))}")
            # intersection of the two token sets
            min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
            c = list(set(post_tokens) & set(variant_tokens))
            ratio = (1 - (len(c) / min_tokens_len)) * 100
            # print(f"matching word count: {len(c)} / {ratio:.2f}%")
            print(f"uniqueness: {ratio:.2f}%")
            print()
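            # Worked example: if the post has 400 unique tokens, the variant
            # has 300, and 60 lemmas occur in both, then
            # ratio = (1 - 60 / 300) * 100 = 80.00, i.e. 80% of the smaller
            # vocabulary is not shared. Dividing by the smaller set keeps the
            # score from being inflated when one text is much longer.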
if not (lection_exist):
    print("Lecture not found in the readme")
if not (variants_exist):
    print("No variants found")
# NB: the script always stops here, so the repository-wide scan below never runs
exit()
files_paths = []
dirs = os.listdir(BASE_DIR)
for dir in dirs:
    dir_path = os.path.join(BASE_DIR, dir)
    if os.path.isdir(dir_path) and (dir != "__pycache__"):
        files = os.listdir(dir_path)
        for file in files:
            file_path = os.path.join(BASE_DIR, dir, file)
            filename, fileext = os.path.splitext(file)
            if os.path.isfile(file_path) and (fileext == '.md'):
                files_paths.append(file_path)
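# Only .md files exactly one directory level below BASE_DIR are collected;
# nested subdirectories are not walked (os.walk would be needed for that).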
out_str = ""
max_ratio = 0
max_ratio_file = ""
for file_1 in tqdm(files_paths):
    small_filename_1 = str(file_1).replace(BASE_DIR, "").strip("\\")
    try:
        with open(file_1, encoding="utf-8") as f_1:
            str1 = f_1.read()
    except UnicodeDecodeError:
        with open(file_1, encoding="cp1251") as f_1:
            str1 = f_1.read()
    # re-save the file as UTF-8 so later runs can read it without the fallback
    with open(file_1, 'w', encoding="utf-8") as f_1:
        f_1.write(str1)
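    # SequenceMatcher.ratio() returns 2*M/T, where M is the number of matching
    # characters and T is the combined length of both strings, so the int(...)
    # below is a whole-document similarity percentage (0-100)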
    ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
    if (ratio > 70):
        out_str += f"{small_filename_1}\n"
        out_str += f"ratio = {ratio}\n"
    if (ratio > max_ratio):
        max_ratio = ratio
        max_ratio_file = small_filename_1
print(out_str)
print()
print(f"max ratio: {max_ratio}%")
print(f"max ratio file: {max_ratio_file}")
print("success")