| 
					
				 | 
			
			
				@@ -4,6 +4,42 @@ from tqdm import tqdm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import requests 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# download stopwords corpus, you need to run it once 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import nltk 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#nltk.download("stopwords") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from nltk.corpus import stopwords 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import pymorphy2 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from string import punctuation 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# Create the lemmatizer and the Russian stopword list once at module load;
# both are module-level singletons reused by preprocess_text() for every
# document (MorphAnalyzer construction is expensive, so do it only once).
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#Preprocess function 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def preprocess_text(text): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    translator = str.maketrans(punctuation, ' '*len(punctuation)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    words = text.translate(translator) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    words = words.lower().split() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # очистка от прилегающего к слову мусора (слово, "или так") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    clear_words = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for word in words: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        clear_word = "" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for s in word: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if not s in punctuation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                clear_word = clear_word + s 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        clear_words.append(clear_word) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tokens = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tokens = [morph.parse(token)[0].normal_form for token in clear_words if token not in russian_stopwords\ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            and token != " " \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            and token.strip() not in punctuation \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    text = " ".join(tokens)     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return tokens, text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 # директория файла 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 BASE_DIR = os.path.abspath(os.path.dirname(__file__)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -17,7 +53,77 @@ response = requests.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 post_html = response.text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 post_list = post_html.split("\n") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-print(post_list[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# проверяем правильность оформления 1й строки 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+line_1 = post_list[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if (line_1[0]) != "#": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("Заголовок статьи не найден") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = line_1.replace("# ", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = header_text.replace(".", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = header_text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# ищем другие лекции по этой теме 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+readme_path = os.path.join(BASE_DIR, "README.md") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    with open(readme_path, encoding="utf-8") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        readme_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    with open(readme_path, encoding="cp1251") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        readme_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+lection_exist = False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+readme_list = readme_html.split("\n") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+for readme_str in readme_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str = readme_str.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str_list = readme_str.split(" ") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str_list.pop(0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    name_str = " ".join(readme_str_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    name_str = name_str.replace(".", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if (str(name_str) == str(header_text)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print("Лекция найдена") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        lection_exist = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        post_tokens, post_uniq_text = preprocess_text(post_html) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество уникальных слов: {len(set(post_tokens))}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # ищем конец списка вариантов лекций (пустая строка) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if lection_exist: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if (readme_str == ""): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            lection_exist = False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # следующие после названия лекции строки 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if lection_exist and (str(name_str) != str(header_text)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_name, t = readme_str.split("]") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_name = variant_name.strip("[") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"проверяю {variant_name}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        t, variant_uri = readme_str.split("(") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_uri = variant_uri.replace("),", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_uri = variant_uri.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+         
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_path = os.path.join(BASE_DIR, variant_uri) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            with open(variant_path, encoding="utf-8") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                variant_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            with open(variant_path, encoding="cp1251") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                variant_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_tokens, variant_uniq_text = preprocess_text(variant_html) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество уникальных слов варианта: {len(set(variant_tokens))}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # пересечение множеств  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        c = list(set(post_tokens) & set(variant_tokens)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ratio = 1 - (len(c) / len(set(post_tokens))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество совпадающих слов: {len(c)} / {ratio}%") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 exit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 files_paths = [] 
			 |