plagiat_1.py

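"""
Plagiarism check for a lecture posted to the course repository.

The script downloads the post from `url`, looks the post's title up in the
local README.md, preprocesses (lemmatizes) the post and every listed variant
of the same lecture, and reports the overlap of their unique-word sets.
The report is printed and prepended to log.md.
"""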
import os
import datetime
from string import punctuation

import requests
import nltk
import pymorphy2

# Download the stopwords corpus; this needs to run only once:
# nltk.download("stopwords")
from nltk.corpus import stopwords

# Create the lemmatizer and the stopword list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
# Preprocessing: lowercase, strip punctuation, lemmatize, drop stopwords
def preprocess_text(text):
    # Replace every punctuation character with a space
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # Strip punctuation still glued to a word (word, "or like this")
    clear_words = []
    for word in words:
        clear_word = "".join(s for s in word if s not in punctuation)
        clear_words.append(clear_word)
    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text
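# Rough usage sketch (exact lemmas depend on the installed pymorphy2 and
# NLTK data; the example words are illustrative):
#   tokens, text = preprocess_text("Критерии классификации угроз.")
#   tokens -> something like ["критерий", "классификация", "угроза"]
#   text   -> "критерий классификация угроза"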
# Directory of this script
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
# Link to the post being checked
url = "http://213.155.192.79:3001/ypv/up/raw/master/%d0%ad%d0%90%d0%a1%d0%b2%d0%97%d0%98/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.3.300_%d0%9a%d1%80%d0%b8%d1%82%d0%b5%d1%80%d0%b8%d0%b8_%d0%ba%d0%bb%d0%b0%d1%81%d1%81%d0%b8%d1%84%d0%b8%d0%ba%d0%b0%d1%86%d0%b8%d0%b8_%d1%83%d0%b3%d1%80%d0%be%d0%b7/Doc.md"
who = "Савкин С."
now = datetime.datetime.now().strftime("%d-%m-%Y %H:%M")
out_str = f"Check: {who}, checked at: {now}\n"
response = requests.get(url)
post_html = response.text
post_list = post_html.split("\n")
# Validate the formatting of the first line
line_1 = post_list[0]
if not line_1.startswith("#"):
    out_str += "Article title not found\n"
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
# Look for other lectures on the same topic
readme_path = os.path.join(BASE_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
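# Assumed README.md layout, inferred from the parsing below (not verified):
#   1. Lecture title
#      [Author name](relative/path/to/variant.md),
#      [Another author](relative/path/to/another.md)
#   <an empty line ends the list of variants>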
# `lection_found` records that the lecture title was seen at all;
# `in_variants` tracks whether we are inside that lecture's variant list.
lection_found = False
in_variants = False
readme_list = readme_html.split("\n")
for readme_str in readme_list:
    readme_str = readme_str.strip()
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)  # drop the list numbering ("1.", "2.", ...)
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")
    if name_str == header_text:
        out_str += "Lecture found\n"
        lection_found = True
        in_variants = True
        post_tokens, post_uniq_text = preprocess_text(post_html)
        out_str += f"number of unique words: {len(set(post_tokens))}\n\n"
    # The list of variants ends at the first empty line
    if in_variants and readme_str == "":
        in_variants = False
    # Lines after the lecture title are links to the variants: [name](uri),
    if in_variants and name_str != header_text and "](" in readme_str:
        variant_name, _ = readme_str.split("]", 1)
        variant_name = variant_name.strip("[")
        out_str += f"checking {variant_name}\n"
        _, variant_uri = readme_str.rsplit("(", 1)
        variant_uri = variant_uri.strip().rstrip(",").rstrip(")")
        variant_path = os.path.join(BASE_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()
        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        out_str += f"number of unique words in the variant: {len(set(variant_tokens))}\n"
        # Intersection of the two token sets
        c = list(set(post_tokens) & set(variant_tokens))
        ratio = (1 - len(c) / len(set(post_tokens))) * 100
        out_str += f"number of matching words: {len(c)}, non-matching: {ratio:.1f}%\n\n"
if not lection_found:
    out_str += "Lecture NOT found\n"
out_str += "\n\n"
print(out_str)
# Write the log (the newest entry goes on top)
log_path = os.path.join(BASE_DIR, "log.md")
try:
    with open(log_path, "r", encoding="utf-8") as f_log:
        prev_str = f_log.read()
except FileNotFoundError:
    prev_str = ""
prev_str = out_str + prev_str
with open(log_path, "w", encoding="utf-8") as f_log:
    f_log.write(prev_str)
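# Illustrative shape of one log.md entry (placeholder values, assuming the
# lecture and one variant are found; not real output):
#
#   Check: Савкин С., checked at: 01-01-2025 12:00
#   Lecture found
#   number of unique words: 150
#
#   checking Иванов И.
#   number of unique words in the variant: 140
#   number of matching words: 30, non-matching: 80.0%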