plagiat_1_full.v2.py

# Check one submission against every file listed in the README (for differentiated credit tests)
import os
from difflib import SequenceMatcher
import datetime
import requests
# download stopwords corpus, you need to run it once
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation
from tqdm import tqdm
# ------------------------------- SETTINGS ------------
# repository root (one level above this file, matching the repository layout for spring term 2022-23)
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# directory with the lectures to check against
# LECTION_DIR = os.path.join("ISRPO", "Лекции")
LECTION_DIR = os.path.join("EASvZI", "Лекции")
# LECTION_DIR = os.path.join("TZI", "Лекции", "ПМ3.2")
# URL of the submission to check
url = "http://213.155.192.79:3001/u20lebed/ISRPO/raw/88f913f3d1af9ee9bed0468b183ec9d4f6e5f2b4/2022-23/%d0%ad%d0%ba%d0%b7%d0%b0%d0%bc%d0%b5%d0%bd/Lebedkin.md"
# ------------------------------- / SETTINGS ------------
# Create lemmatizer and stopwords list
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocess function: punctuation removal, lowercasing, stopword filtering, lemmatization
def preprocess_text(text):
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # strip punctuation still attached to a word (e.g. 'word,' or '"quoted"')
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word = clear_word + s
        clear_words.append(clear_word)
    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text
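# Illustrative only (not executed): for an input like "Привет, мир! Это новый тест."
# preprocess_text should return roughly (["привет", "мир", "новый", "тест"], "привет мир новый тест");
# the exact tokens depend on the NLTK stopword list and the pymorphy2 dictionaries in use.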
print()
now = datetime.datetime.now().strftime('%d-%m-%Y %H:%M')
out_str = f"Check time: {now} \n"
# print(out_str)

# download the submission being checked and tokenize it
response = requests.get(url)
post_html = response.text
post_tokens, post_uniq_text = preprocess_text(post_html)
print(f"number of unique words: {len(set(post_tokens))}")
print()
post_list = post_html.split("\n")
# find the other lectures via the section README
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    # fall back to cp1251 if the file is not valid UTF-8
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
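# Note: the loop below assumes each entry in README.md is a Markdown link,
# e.g. "[Тема лекции](lecture1.md)"; lines without '[' are skipped.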
min_ratio = 100
min_ratio_file = ""
readme_list = readme_html.split("\n")
for readme_str in tqdm(readme_list):
    if '[' in readme_str:
        # print(f"checking {readme_str}")
        # pull the link text and the relative path out of the Markdown link
        variant_name, t = readme_str.split("]")
        variant_name = variant_name.strip("[")
        t, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()
        variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()
        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        # print(f"number of unique words in this variant: {len(set(variant_tokens))}")
        # intersection of the two sets of unique words
        min_tokens_len = min([len(set(post_tokens)), len(set(variant_tokens))])
        c = list(set(post_tokens) & set(variant_tokens))
        ratio = (1 - (len(c) / min_tokens_len)) * 100
        if ratio < min_ratio:
            min_ratio = ratio
            min_ratio_file = variant_path
        # print(f"matching words: {len(c)} / {ratio:.2f}%")
        # print()
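# min_ratio is the share (in %) of unique words in the smaller of the two documents
# that is NOT shared with the other: values near 0% mean almost complete vocabulary
# overlap with some lecture, values near 100% mean the texts have little in common.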
print()
print(f"min_ratio: {min_ratio}%")
print(f"min_ratio_file: {min_ratio_file}")
print("success")