3 年之前 · fa2c7d67da
--- a/parser.py
+++ b/parser.py
@@ -134,6 +134,23 @@ Session = sessionmaker(bind=engine)
 
				 sqllite_session = Session()
			
 
				 
			
 
				 
			
 
				+# https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
			
 
				+
			
 
				+# Build the lemmatizer and the Russian stop-word list once at module level
			
 
				+# so that preprocess_text() can reuse them on every call.
			
 
				+mystem = Mystem() 
			
 
				+russian_stopwords = stopwords.words("russian")
			
 
				+
			
 
				+#Preprocess function
			
 
				+def preprocess_text(text):
				+    """Lemmatize Russian text and return the kept lemmas joined by spaces.
				+
				+    The text is lower-cased and lemmatized with the module-level ``mystem``
				+    object. Lemmas found in ``russian_stopwords`` and tokens that are empty
				+    or consist only of punctuation after stripping are dropped.
				+
				+    Args:
				+        text: The string to preprocess.
				+
				+    Returns:
				+        The remaining lemmas joined by single spaces.
				+    """
				+    def is_noise(token):
				+        # NOTE(review): assumes `punctuation` is a collection of single
				+        # characters such as string.punctuation - confirm against the imports.
				+        # Checking every character also rejects runs like "..." or "?!" that
				+        # a plain substring test against `punctuation` lets through. all() is
				+        # True for an empty string, so whitespace-only tokens are dropped too.
				+        return all(char in punctuation for char in token.strip())
				+
				+    lemmas = mystem.lemmatize(text.lower())
				+    kept = [
				+        lemma for lemma in lemmas
				+        if lemma not in russian_stopwords and not is_noise(lemma)
				+    ]
				+    return " ".join(kept)
			
 
				+
			
 
				 
			
 
				 # парсим по случайных 10 ссылок за раз
			
 
				 for i in range(0, 10):
			
@@ -154,25 +171,22 @@ for i in range(0, 10):
 
				             'Upgrade-Insecure-Requests': '1',
			
 
				             'Connection': 'keep-alive'}
			
 
				 
			
 
				-    try:
			
 
				-        response = requests.get(link.href, headers=headers)
			
 
				-        print("status_code: ", response.status_code)
			
 
				-        if (response.status_code == 200):
			
 
				-            link.html = response.text
			
 
				-            soup = BeautifulSoup(response.text, "lxml")
			
 
				-            title = str(soup.find('title').string.strip())
			
 
				-            print("title: ", title)
			
 
				-            link.title = title
			
 
				-
			
 
				-            text = soup.get_text()
			
 
				-            link.text = text
			
 
				-
			
 
				-            # https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
			
 
				+    response = requests.get(link.href, headers=headers)
			
 
				+    print("status_code: ", response.status_code)
			
 
				+    if (response.status_code == 200):
			
 
				+        link.html = response.text
			
 
				+        soup = BeautifulSoup(response.text, "lxml")
			
 
				+        title = str(soup.find('title').string.strip())
			
 
				+        print("title: ", title)
			
 
				+        link.title = title
			
 
				 
			
 
				+        text = soup.get_text()
			
 
				+        link.text = text
			
 
				 
			
 
				+        lemmas = preprocess_text(text)
			
 
				+        print(lemmas)
			
 
				+        print()
			
 
				 
			
 
				-    except:
			
 
				-        print("Ошибка, пропускаем")
			
 
				 
			
 
				 
			
 
				     print()