plagiat_1.py

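"""A rough plagiarism check for lecture write-ups.

Downloads the post under review, matches its header against the lecture
list in README.md, and compares token sets with the listed variants;
when the lecture is not listed, falls back to a fuzzy SequenceMatcher
scan over every .md file in the repository.
"""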
import os
from difflib import SequenceMatcher
from tqdm import tqdm
import requests
# Download the stopwords corpus; this only needs to run once.
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
from string import punctuation

# Create the lemmatizer and the stopword list.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocessing: replace punctuation with spaces, lowercase, drop
# stopwords, and lemmatize with pymorphy2.
def preprocess_text(text):
    translator = str.maketrans(punctuation, " " * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # Strip punctuation still glued to a word (word, "or like this").
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word = clear_word + s
        clear_words.append(clear_word)
    tokens = [
        morph.parse(token)[0].normal_form
        for token in clear_words
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
    ]
    text = " ".join(tokens)
    return tokens, text
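
# Illustrative call (the exact lemmas depend on which parse pymorphy2
# ranks first for each token):
#   tokens, text = preprocess_text("Критерии классификации угроз.")
#   # tokens -> roughly ["критерий", "классификация", "угроза"]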

# Directory of this script.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
# Link to the post being checked.
url = "http://213.155.192.79:3001/ypv/up/raw/master/%d0%ad%d0%90%d0%a1%d0%b2%d0%97%d0%98/%d0%9b%d0%b5%d0%ba%d1%86%d0%b8%d0%b8/1.3.300_%d0%9a%d1%80%d0%b8%d1%82%d0%b5%d1%80%d0%b8%d0%b8_%d0%ba%d0%bb%d0%b0%d1%81%d1%81%d0%b8%d1%84%d0%b8%d0%ba%d0%b0%d1%86%d0%b8%d0%b8_%d1%83%d0%b3%d1%80%d0%be%d0%b7/Doc.md"
print()
response = requests.get(url)
post_html = response.text
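# Note: requests decodes .text using the charset announced in the
# response headers; if the server mislabels it, set response.encoding
# (e.g. to "utf-8") before accessing .text.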
post_list = post_html.split("\n")
# Check that the first line of the post is a Markdown header.
line_1 = post_list[0]
if not line_1.startswith("#"):
    print("Article header not found")
header_text = line_1.replace("# ", "")
header_text = header_text.replace(".", "")
header_text = header_text.strip()
# Look for other lectures on the same topic.
readme_path = os.path.join(BASE_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
lection_exist = False
lection_found = False  # set when the post's header is found in README
readme_list = readme_html.split("\n")
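# Assumption about the README layout (inferred from the parsing below):
# each line starts with a marker token that is dropped, the rest is the
# lecture name; variant lines carry a Markdown link "[name](path),".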
for readme_str in readme_list:
    readme_str = readme_str.strip()
    readme_str_list = readme_str.split(" ")
    readme_str_list.pop(0)
    name_str = " ".join(readme_str_list)
    name_str = name_str.replace(".", "")
    if name_str == header_text:
        print("Lecture found")
        lection_exist = True
        lection_found = True
        post_tokens, post_uniq_text = preprocess_text(post_html)
        print(f"number of unique words: {len(set(post_tokens))}")
        print()
    # The list of lecture variants ends at an empty line.
    if lection_exist and readme_str == "":
        lection_exist = False
    # Lines after the lecture name are its variants.
    if lection_exist and name_str != header_text:
        variant_name, _ = readme_str.split("]", 1)
        variant_name = variant_name.strip("[")
        print(f"checking {variant_name}")
        _, variant_uri = readme_str.split("(", 1)
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.strip()
        variant_path = os.path.join(BASE_DIR, variant_uri)
        try:
            with open(variant_path, encoding="utf-8") as f:
                variant_html = f.read()
        except UnicodeDecodeError:
            with open(variant_path, encoding="cp1251") as f:
                variant_html = f.read()
        variant_tokens, variant_uniq_text = preprocess_text(variant_html)
        print(f"number of unique words in the variant: {len(set(variant_tokens))}")
        # Intersection of the two token sets.
        c = list(set(post_tokens) & set(variant_tokens))
        ratio = (1 - len(c) / len(set(post_tokens))) * 100
        print(f"matching words: {len(c)}, share of post words missing from the variant: {ratio:.1f}%")
        print()

# If the lecture was listed in README, the variant comparison above is
# enough; otherwise fall through to the brute-force scan below.
if lection_found:
    exit()
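
# Fallback stage: collect the repository's .md files and fuzzy-compare
# each one against the downloaded post.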
files_paths = []
dirs = os.listdir(BASE_DIR)
for dir_name in dirs:
    dir_path = os.path.join(BASE_DIR, dir_name)
    if os.path.isdir(dir_path) and dir_name != "__pycache__":
        files = os.listdir(dir_path)
        for file_name in files:
            file_path = os.path.join(BASE_DIR, dir_name, file_name)
            filename, fileext = os.path.splitext(file_name)
            if os.path.isfile(file_path) and fileext == ".md":
                files_paths.append(file_path)
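
# Note: os.listdir() covers only one level of subdirectories; a deeper
# tree would need os.walk() instead.
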
out_str = ""
max_ratio = 0
max_ratio_file = ""
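
# SequenceMatcher.ratio() returns 2*M/T (M = matching characters, T =
# combined length of both strings), so 100 after scaling means the
# texts are identical.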
for file_1 in tqdm(files_paths):
    small_filename_1 = os.path.relpath(file_1, BASE_DIR)
    try:
        with open(file_1, encoding="utf-8") as f_1:
            str1 = f_1.read()
    except UnicodeDecodeError:
        with open(file_1, encoding="cp1251") as f_1:
            str1 = f_1.read()
        # Re-save the cp1251 file as UTF-8 so later runs read it directly.
        with open(file_1, "w", encoding="utf-8") as f_1:
            f_1.write(str1)
    ratio = int(SequenceMatcher(None, str1.lower(), post_html.lower()).ratio() * 100)
    if ratio > 70:
        out_str += f"{small_filename_1}\n"
        out_str += f"ratio = {ratio}\n"
    if ratio > max_ratio:
        max_ratio = ratio
        max_ratio_file = small_filename_1
print(out_str)
print()
print(f"max ratio: {max_ratio}%")
print(f"max ratio file: {max_ratio_file}")
print("success")
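
# Typical run, assuming README.md and the lecture folders sit next to
# this script:
#   python plagiat_1.py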