|
@@ -15,6 +15,16 @@ from requests.exceptions import ProxyError
|
|
|
from urllib.parse import urlparse, urljoin
|
|
|
import tldextract
|
|
|
|
|
|
+# Download the NLTK stopwords corpus; this only needs to be done once per environment.
|
|
|
+import nltk
|
|
|
+nltk.download("stopwords")
|
|
|
+#--------#
|
|
|
+
|
|
|
+from nltk.corpus import stopwords
|
|
|
+from pymystem3 import Mystem
|
|
|
+from string import punctuation
|
|
|
+
|
|
|
+
|
|
|
# Declarative definition
|
|
|
from sqlalchemy import Column, Integer, String, Text, create_engine
|
|
|
from sqlalchemy.ext.declarative import declarative_base
|
|
@@ -195,8 +205,15 @@ for i in range(0, 10):
|
|
|
if (response.status_code == 200):
|
|
|
link.html = response.text
|
|
|
soup = BeautifulSoup(response.text, "lxml")
|
|
|
- title = soup.find('title').string.strip()
|
|
|
+ title = str(soup.find('title').string.strip())
|
|
|
print("title: ", title)
|
|
|
+ link.title = title
|
|
|
+
|
|
|
+ text = soup.get_text()
|
|
|
+ link.text = text
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
|
|
|
|
|
|
except:
|