plagiat_all.v3.py

# Full-check version, with Russian spell checking.
import os
from difflib import SequenceMatcher  # unused in this spelling-only version
from tqdm import tqdm                # unused here
import datetime                      # unused here
import requests                      # unused here
# Download the stopwords corpus; this needs to run only once.
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
import pymorphy2
import language_tool_python
from string import punctuation
# ------------------------------- SETTINGS -------------------------------
# Base directory: one level above this file (matches the repository layout
# of semester 2, 2022-23).
BASE_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
# Directory being checked.
LECTION_DIR = os.path.join("Лекции", "ПМ3.2")
# Output report file.
final_filename = "bad_07Dec23_ПМ3.2_2.txt"
# Line of README.md where the lecture list starts; header lines above it are skipped.
start_line = 63
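# Illustrative example (hypothetical paths): if this script lives at
# /repo/tools/plagiat_all.v3.py, then BASE_DIR resolves to /repo and the
# README parsed below is /repo/Лекции/ПМ3.2/README.md.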
# Create the lemmatizer and the stopword list.
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")

# Preprocessing: strip punctuation, lowercase, drop stopwords, lemmatize.
def preprocess_text(text):
    # Replace every punctuation character with a space, then split into words.
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    words = text.translate(translator)
    words = words.lower().split()
    # Strip punctuation still attached to a word (e.g.: word, "or like this").
    clear_words = []
    for word in words:
        clear_word = ""
        for s in word:
            if s not in punctuation:
                clear_word += s
        clear_words.append(clear_word)
    # Lemmatize and drop stopwords, spaces and leftover punctuation.
    tokens = [morph.parse(token)[0].normal_form for token in clear_words
              if token not in russian_stopwords
              and token != " "
              and token.strip() not in punctuation]
    text = " ".join(tokens)
    return tokens, text
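# Expected behaviour on a small input (assuming the NLTK stopwords corpus is
# downloaded; exact lemmas depend on the pymorphy2 dictionaries):
# preprocess_text("Это тестовый текст!")
#   -> (['тестовый', 'текст'], 'тестовый текст')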
# Spell checking: count tokens that LanguageTool flags with at least one match.
tool = language_tool_python.LanguageTool('ru-RU')

def orfo_text(tokens):
    bad_tokens_n = 0
    for token in tokens:
        matches = tool.check(token)
        if len(matches) > 0:
            bad_tokens_n += 1
            #print(matches[0].ruleId)
    return bad_tokens_n
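# Note: one tool.check() call per token means one round-trip to the local
# LanguageTool server per word, which is slow on long lectures. A faster
# variant (a sketch; it counts rule matches rather than distinct bad tokens)
# would check the joined text in a single call:
# def orfo_text_fast(tokens):
#     return len(tool.check(" ".join(tokens)))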
# Find the other lectures on this topic.
readme_path = os.path.join(BASE_DIR, LECTION_DIR, "README.md")
try:
    with open(readme_path, encoding="utf-8") as f:
        readme_html = f.read()
except UnicodeDecodeError:
    with open(readme_path, encoding="cp1251") as f:
        readme_html = f.read()
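# The same utf-8/cp1251 fallback is repeated below for every variant file;
# a small helper (hypothetical, not in the original) would remove the
# duplication:
# def read_text(path):
#     try:
#         with open(path, encoding="utf-8") as f:
#             return f.read()
#     except UnicodeDecodeError:
#         with open(path, encoding="cp1251") as f:
#             return f.read()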
  62. """
  63. █ █ █████ ███████
  64. █ █ ██ ██ ██ ██
  65. █ █ ███████ ███████
  66. █ █ ██ ██ ██ ██
  67. ██ ██ ██ ██ ██
  68. """
bad_variants_text = ""
lection_name_str = ""
readme_list = readme_html.split("\n")
for readme_str in readme_list[start_line:]:
    readme_str = readme_str.strip()
    if "[" in readme_str:
        # Markdown link line: "[variant name](uri)," -> name and uri.
        variant_name, t = readme_str.split("]")
        variant_name = variant_name.strip("[")
        print(f"checking: {variant_name} / {lection_name_str}")
        t, variant_uri = readme_str.split("(")
        variant_uri = variant_uri.replace("),", "")
        variant_uri = variant_uri.replace(")", "")
        variant_uri = variant_uri.strip()
        if "youtube" in variant_uri:
            print("youtube - not checked")
            print()
        else:
            variant_path = os.path.join(BASE_DIR, LECTION_DIR, variant_uri)
            if os.path.isfile(variant_path):
                try:
                    with open(variant_path, encoding="utf-8") as f:
                        variant_html = f.read()
                except UnicodeDecodeError:
                    with open(variant_path, encoding="cp1251") as f:
                        variant_html = f.read()
                variant_tokens, variant_uniq_text = preprocess_text(variant_html)
                print(f"unique words in this variant: {len(set(variant_tokens))}")
                bad_tokens_n = orfo_text(variant_tokens)
                # Share of flagged tokens as a percentage, truncated to two
                # decimal places (guarding against empty files).
                if variant_tokens:
                    bad_tokens_stat = int(bad_tokens_n / len(variant_tokens) * 10000) / 100
                else:
                    bad_tokens_stat = 0.0
                print(f"error rate: {bad_tokens_stat}%")
                bad_variants_text += f"{lection_name_str}\n{variant_name}: {bad_tokens_stat}\n\n"
            else:
                bad_variants_text += f"!!! {lection_name_str}\n{variant_name}: file missing\n"
            # Rewrite the report after each variant so partial results survive a crash.
            with open(final_filename, "w", encoding="utf-8") as f:
                f.write(bad_variants_text)
    else:
        # Any non-link line is remembered as the current lecture heading.
        lection_name_str = readme_str
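# SequenceMatcher is imported at the top but unused in this spelling-only
# version; the plagiarism half of the full check presumably compares the
# preprocessed variant texts pairwise. A minimal sketch of such a comparison
# (an assumption, not taken from this file):
# def similarity(text_a, text_b):
#     return SequenceMatcher(None, text_a, text_b).ratio()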