před 3 roky · 47369a6eee
--- a/README.md
+++ b/README.md
@@ -17,4 +17,6 @@ parser.py - парсим текст с найденных ссылок
 
				 pip install sqlalchemy  
			
 
				 pip install bs4  
			
 
				 pip install lxml   
			
 
				-pip install tldextract   
			
 
				+pip install tldextract   
			
 
				+pip install nltk  
			
 
				+pip install pymystem3  
			
--- a/all.db-journal
+++ b/all.db-journal
--- a/parser.py
+++ b/parser.py
@@ -15,6 +15,16 @@ from requests.exceptions import ProxyError
 
				 from urllib.parse import urlparse, urljoin
			
 
				 import tldextract
			
 
				 
			
 
				+# Download the NLTK stopwords corpus; this only needs to be run once.
			
 
				+import nltk
			
 
				+nltk.download("stopwords")
			
 
				+#--------#
			
 
				+
			
 
				+from nltk.corpus import stopwords
			
 
				+from pymystem3 import Mystem
			
 
				+from string import punctuation
			
 
				+
			
 
				+
			
 
				 #декларативное определение
			
 
				 from sqlalchemy import Column, Integer, String, Text, create_engine
			
 
				 from sqlalchemy.ext.declarative import declarative_base
			
@@ -195,8 +205,15 @@ for i in range(0, 10):
 
				         if (response.status_code == 200):
			
 
				             link.html = response.text
			
 
				             soup = BeautifulSoup(response.text, "lxml")
			
 
				-            title = soup.find('title').string.strip()
			
 
				+            title = str(soup.find('title').string.strip())
			
 
				             print("title: ", title)
			
 
				+            link.title = title
			
 
				+
			
 
				+            text = soup.get_text()
			
 
				+            link.text = text
			
 
				+
			
 
				+
			
 
				+
			
 
				 
			
 
				 
			
 
				     except: