| 
					
				 | 
			
			
				@@ -4,6 +4,42 @@ from tqdm import tqdm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import requests 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# download stopwords corpus, you need to run it once 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import nltk 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#nltk.download("stopwords") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from nltk.corpus import stopwords 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import pymorphy2 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from string import punctuation 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# Create the lemmatizer and the Russian stopword list once at module load;
# both are module-level singletons reused by preprocess_text() for every
# document (MorphAnalyzer construction is expensive, so do it only once).
morph = pymorphy2.MorphAnalyzer()
russian_stopwords = stopwords.words("russian")
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#Preprocess function 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def preprocess_text(text): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    translator = str.maketrans(punctuation, ' '*len(punctuation)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    words = text.translate(translator) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    words = words.lower().split() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # очистка от прилегающего к слову мусора (слово, "или так") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    clear_words = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for word in words: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        clear_word = "" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for s in word: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if not s in punctuation: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                clear_word = clear_word + s 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        clear_words.append(clear_word) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tokens = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tokens = [morph.parse(token)[0].normal_form for token in clear_words if token not in russian_stopwords\ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            and token != " " \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            and token.strip() not in punctuation \ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    text = " ".join(tokens)     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return tokens, text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 # директория файла 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 BASE_DIR = os.path.abspath(os.path.dirname(__file__)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -17,7 +53,77 @@ response = requests.get(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 post_html = response.text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 post_list = post_html.split("\n") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-print(post_list[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# проверяем правильность оформления 1й строки 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+line_1 = post_list[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if (line_1[0]) != "#": 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("Заголовок статьи не найден") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = line_1.replace("# ", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = header_text.replace(".", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+header_text = header_text.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# ищем другие лекции по этой теме 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+readme_path = os.path.join(BASE_DIR, "README.md") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    with open(readme_path, encoding="utf-8") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        readme_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    with open(readme_path, encoding="cp1251") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        readme_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+lection_exist = False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+readme_list = readme_html.split("\n") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+for readme_str in readme_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str = readme_str.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str_list = readme_str.split(" ") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    readme_str_list.pop(0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    name_str = " ".join(readme_str_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    name_str = name_str.replace(".", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if (str(name_str) == str(header_text)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print("Лекция найдена") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        lection_exist = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        post_tokens, post_uniq_text = preprocess_text(post_html) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество уникальных слов: {len(set(post_tokens))}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # ищем конец списка вариантов лекций (пустая строка) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if lection_exist: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if (readme_str == ""): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            lection_exist = False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # следующие после названия лекции строки 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if lection_exist and (str(name_str) != str(header_text)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_name, t = readme_str.split("]") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_name = variant_name.strip("[") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"проверяю {variant_name}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        t, variant_uri = readme_str.split("(") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_uri = variant_uri.replace("),", "") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_uri = variant_uri.strip() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+         
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_path = os.path.join(BASE_DIR, variant_uri) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            with open(variant_path, encoding="utf-8") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                variant_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            with open(variant_path, encoding="cp1251") as f: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                variant_html = f.read() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        variant_tokens, variant_uniq_text = preprocess_text(variant_html) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество уникальных слов варианта: {len(set(variant_tokens))}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # пересечение множеств  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        c = list(set(post_tokens) & set(variant_tokens)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ratio = 1 - (len(c) / len(set(post_tokens))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print(f"количество совпадающих слов: {len(c)} / {ratio}%") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 exit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 files_paths = [] 
			 |