@@ -11,7 +11,9 @@ from bs4 import BeautifulSoup
 import random, time, datetime
 import requests
 from requests.exceptions import ProxyError
-from urllib.parse import urlparse
+
+from urllib.parse import urlparse, urljoin
+import tldextract
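+# tldextract is used to compare registered domains, so links on subdomains of a donor still count as internal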
 
 # declarative model definition
 from sqlalchemy import Column, Integer, String, Text, create_engine
@@ -21,10 +23,10 @@ from sqlalchemy.orm import sessionmaker
 #---------------------------------- Variables ----------
 
 donors = [
-    "https://it-events.com/",
-    "https://habr.com/ru/all/",
     "http://npedkol.ru/",
-    "https://innopolis.university/",
+    #"https://it-events.com/",
+    #"https://habr.com/ru/all/",
+    #"https://innopolis.university/",
 ]
 
 keywords = [
@@ -54,6 +56,7 @@ user_agents = [
     "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
 ]
 
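+# file extensions (without the leading dot) that should not be crawled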
+bad_file_extensions = ["pdf", "doc", "docx"]
 
 #---------------------------------- Variables End ----------
 
@@ -77,16 +80,18 @@ class Links(Base):
     parse_date = Column(Integer)
     html = Column(Text)
     text = Column(Text)
+    lemmas = Column(Text)  # bag of lemmas extracted from the page text
     level = Column(Integer)  # nesting level of the link relative to the site root
     status = Column(Integer)  # 0 - not downloaded; 1 - downloaded, keyword not found; 2 - keyword found, notification not sent; 3 - notification sent
 
-    def __init__(self, donor, title, href, parse_date, html, text, level, status):
+    def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
         self.donor = donor
         self.title = title
         self.href = href
         self.parse_date = parse_date
         self.html = html
         self.text = text
+        self.lemmas = lemmas
         self.level = level
         self.status = status
 
@@ -114,11 +119,52 @@ class Log(Base):
 # create the tables
 Base.metadata.create_all(engine)
 
-
 Session = sessionmaker(bind=engine)
 sqllite_session = Session()
 
 
+def select_good_link(link_href, donor):
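+    """Decide whether a link found on a donor page should be crawled.
+
+    Returns (is_bad, link_2): link_2 is the href resolved to an absolute,
+    query-less URL; is_bad is True for external links, mailto links and
+    links whose file extension is listed in bad_file_extensions.
+    """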
+    is_bad = False
+    donor_parsed = urlparse(donor)
+    link_parsed = urlparse(link_href)
+
+    # resolve the link to a canonical absolute form
+    if link_parsed.hostname is None:
+        link_2 = donor_parsed.scheme + "://" + donor_parsed.hostname
+        link_2 = urljoin(link_2, link_href)
+    else:
+        link_2 = link_href
+
+    # only internal links are of interest (links on donor subdomains count as internal)
+    donor_domain = tldextract.extract(donor).domain
+    link_domain = tldextract.extract(link_2).domain
+    print("link_domain: ", link_domain)
+    if link_domain != donor_domain:
+        print("External link, skipping")
+        is_bad = True
+
+    # drop GET parameters so they do not interfere with the extension check
+    link_2 = str(link_2).split("?")[0]
+
+    # filter out mailto links, documents (.pdf, .doc, ...) and the like
+    if link_2.find("mailto") != -1:
+        print("mailto, skipping")
+        is_bad = True
+
+    filename, file_extension = os.path.splitext(link_2)
+    # os.path.splitext keeps the leading dot, so strip it before comparing
+    if file_extension.lstrip(".").lower() in bad_file_extensions:
+        print(file_extension, ", skipping")
+        is_bad = True
+
+    return is_bad, link_2
+
+
+
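+# Rough usage sketch (illustrative values, assuming donor = "http://npedkol.ru/"):
+#   select_good_link("/about/?page=2", donor)            -> (False, "http://npedkol.ru/about/")
+#   select_good_link("mailto:info@npedkol.ru", donor)    -> (True,  "mailto:info@npedkol.ru")
+#   select_good_link("https://habr.com/ru/all/", donor)  -> (True,  "https://habr.com/ru/all/")
+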
 for donor in donors:
     print("Parsing ", donor)
     # build the request
@@ -139,39 +185,47 @@ for donor in donors:
     response = requests.get(donor, headers=headers)
     post_html = response.text
 
-    with open(os.path.join(basedir, 'habr.html'), 'w', encoding="utf-8") as f:
+    with open(os.path.join(basedir, donor_parsed.hostname+'.html'), 'w', encoding="utf-8") as f:
         f.write(post_html)
 
-    #Парсим ссылки habr.com
+    # parse the links from the downloaded page
     soup = BeautifulSoup(post_html, "lxml")
     all_links = soup.find_all('a')
     for link in all_links:
-        print(link)
-        """
-        link_parsed =
-        link_href = str(vacancy.find('a', {'class': 'tm-article-snippet__title-link'}).get('href')).strip()
-        link_href = "https://habr.com" + link_href
+        link_href = link.get('href')
+        print(link_href)
+        if not link_href:
+            # skip <a> tags without an href (anchors, JS-only links)
+            continue
+
+        is_bad, link_2 = select_good_link(link_href, donor)
+        if is_bad:
+            print()
+            continue
+
+        print("link_2: ", link_2)
 
         # link uniqueness check
-        out_s += "Проверяем уникальность\n" + "<br>"
-        link_n = sqllite_session.query(Links).filter(Links.donor_link_id == donor_link_id).count()
-        out_s += "link_n: " + str(link_n) + "<br>"
+        print("Checking uniqueness")
+        link_n = sqllite_session.query(Links).filter(Links.href == link_2).count()
+        print("link_n: " + str(link_n))
         if (link_n == 0):
-            out_s += "Добавляем в базу" + "<br>"
+            print("Adding to the database")
             new_link = Links(
-                title = link_title,
+                title = "",
-                href = link_href,
+                href = link_2,  # store the canonical URL so the uniqueness check above keeps working
-                donor = 'habr.ru',
-                donor_link_id = donor_link_id,
+                donor = donor,
                 parse_date = int(time.time()),
-                text = ""
-                )
+                html = "",
+                text = "",
+                lemmas = "",
+                level = 1,   # links collected from the donor front page
+                status = 0   # not downloaded yet
+            )
             sqllite_session.add(new_link)
         else:
-            out_s += "В базе ссылка есть" + "<br>"
-        """
-
+            print("Link already in the database")
+
+        print()
+
"""
|
|
|
new_log = Log(
|
|
|
action = "parse",
|