|
@@ -134,6 +134,23 @@ Session = sessionmaker(bind=engine)
|
|
|
sqllite_session = Session()
|
|
|
|
|
|
|
|
|
+# https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
|
|
|
+
|
|
|
+#Create lemmatizer and stopwords list
|
|
|
+mystem = Mystem()
|
|
|
+russian_stopwords = stopwords.words("russian")
|
|
|
+
|
|
|
+#Preprocess function
|
|
|
def preprocess_text(text: str) -> str:
    """Lemmatize ``text`` and return the kept lemmas joined by single spaces.

    The text is lower-cased and passed to the module-level ``mystem``
    lemmatizer.  A token is dropped when it is listed in
    ``russian_stopwords`` or when, after stripping surrounding whitespace,
    it is empty or consists solely of characters found in ``punctuation``.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving tokens joined with a single space.
    """
    # Build the lookup once per call so every membership test is O(1)
    # instead of a linear scan over the stopword list.
    stopword_set = set(russian_stopwords)

    kept = []
    for token in mystem.lemmatize(text.lower()):
        if token in stopword_set:
            continue
        stripped = token.strip()
        # Check character by character: a plain ``stripped in punctuation``
        # is a substring test when ``punctuation`` is a string, so runs such
        # as "..." or "?!" would slip through.  ``all`` over an empty string
        # is True, which also discards whitespace-only tokens.
        if all(char in punctuation for char in stripped):
            continue
        kept.append(token)

    return " ".join(kept)
|
|
|
+
|
|
|
|
|
|
 # parse 10 random links at a time
|
|
|
for i in range(0, 10):
|
|
@@ -154,25 +171,22 @@ for i in range(0, 10):
|
|
|
'Upgrade-Insecure-Requests': '1',
|
|
|
'Connection': 'keep-alive'}
|
|
|
|
|
|
- try:
|
|
|
- response = requests.get(link.href, headers=headers)
|
|
|
- print("status_code: ", response.status_code)
|
|
|
- if (response.status_code == 200):
|
|
|
- link.html = response.text
|
|
|
- soup = BeautifulSoup(response.text, "lxml")
|
|
|
- title = str(soup.find('title').string.strip())
|
|
|
- print("title: ", title)
|
|
|
- link.title = title
|
|
|
-
|
|
|
- text = soup.get_text()
|
|
|
- link.text = text
|
|
|
-
|
|
|
- # https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
|
|
|
+ response = requests.get(link.href, headers=headers)
|
|
|
+ print("status_code: ", response.status_code)
|
|
|
+ if (response.status_code == 200):
|
|
|
+ link.html = response.text
|
|
|
+ soup = BeautifulSoup(response.text, "lxml")
|
|
|
+ title = str(soup.find('title').string.strip())
|
|
|
+ print("title: ", title)
|
|
|
+ link.title = title
|
|
|
|
|
|
+ text = soup.get_text()
|
|
|
+ link.text = text
|
|
|
|
|
|
+ lemmas = preprocess_text(text)
|
|
|
+ print(lemmas)
|
|
|
+ print()
|
|
|
|
|
|
- except:
|
|
|
- print("Ошибка, пропускаем")
|
|
|
|
|
|
|
|
|
print()
|