瀏覽代碼

пример лемматизации MyStem

ypv 3 年之前
父節點
當前提交
fa2c7d67da
共有 1 個文件被更改,包括 30 次插入和 16 次删除
  1. 30 16
      parser.py

+ 30 - 16
parser.py

@@ -134,6 +134,23 @@ Session = sessionmaker(bind=engine)
 sqllite_session = Session()
 
 
+# https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
+
+#Create lemmatizer and stopwords list
+mystem = Mystem() 
+russian_stopwords = stopwords.words("russian")
+
+#Preprocess function
+def preprocess_text(text):
+    tokens = mystem.lemmatize(text.lower())
+    tokens = [token for token in tokens if token not in russian_stopwords\
+            and token != " " \
+            and token.strip() not in punctuation]
+    
+    text = " ".join(tokens)
+    
+    return text
+
 
 # парсим по случайных 10 ссылок за раз
 for i in range(0, 10):
@@ -154,25 +171,22 @@ for i in range(0, 10):
             'Upgrade-Insecure-Requests': '1',
             'Connection': 'keep-alive'}
 
-    try:
-        response = requests.get(link.href, headers=headers)
-        print("status_code: ", response.status_code)
-        if (response.status_code == 200):
-            link.html = response.text
-            soup = BeautifulSoup(response.text, "lxml")
-            title = str(soup.find('title').string.strip())
-            print("title: ", title)
-            link.title = title
-
-            text = soup.get_text()
-            link.text = text
-
-            # https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
+    response = requests.get(link.href, headers=headers)
+    print("status_code: ", response.status_code)
+    if (response.status_code == 200):
+        link.html = response.text
+        soup = BeautifulSoup(response.text, "lxml")
+        title = str(soup.find('title').string.strip())
+        print("title: ", title)
+        link.title = title
 
+        text = soup.get_text()
+        link.text = text
 
+        lemmas = preprocess_text(text)
+        print(lemmas)
+        print()
 
-    except:
-        print("Ошибка, пропускаем")
 
 
     print()