# -*- encoding: utf-8 -*-
#
"""
virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
exec(open(activate_this).read(), dict(__file__=activate_this))
"""
import os, traceback
from bs4 import BeautifulSoup
import random, time, datetime
import requests
from requests.exceptions import ProxyError
from urllib.parse import urlparse, urljoin
import tldextract
# download the stopwords corpus; this only needs to run once
import nltk
nltk.download("stopwords")
#--------#
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
# declarative ORM definitions
from sqlalchemy import Column, Integer, String, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import func
#---------------------------------- Variables ----------
donors = [
"http://npedkol.ru/",
"https://it-events.com/",
"https://habr.com/ru/all/",
"https://innopolis.university/",
]
keywords = [
"олимпиада",
"хакатон",
"конкурс"
]
user_agents = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
"Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
"Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
"Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
"Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
"Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
"Mozilla / 5.0 (Macintosh; Intel Mac OS X 10.14; rv: 75.0) Gecko / 20100101 Firefox / 75.0",
"Mozilla / 5.0 (Windows NT 6.1; Win64; x64; rv: 74.0) Gecko / 20100101 Firefox / 74.0",
"Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / 80.0.3987.163 Safari / 537.36",
"Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
]
bad_file_extensions = [".pdf", ".doc", ".docx"]  # leading dot included, matching os.path.splitext output
#---------------------------------- Variables End ----------
out_s = ""
# Initialize SQLite
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
# ORM model for the collected link data
class Links(Base):
__tablename__ = 'links'
id = Column(Integer, primary_key=True, autoincrement=True)
donor = Column(String(255))
title = Column(String(512))
href = Column(String(512))
parse_date = Column(Integer)
html = Column(Text)
text = Column(Text)
    lemmas = Column(Text)  # set of lemmas extracted from the text (bag of words)
    level = Column(Integer)  # link depth relative to the site root
    status = Column(Integer)  # 0 - not downloaded; 1 - downloaded, no keyword; 2 - keyword found, notification not sent; 3 - notification sent
def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
self.donor = donor
self.title = title
self.href = href
self.parse_date = parse_date
self.html = html
self.text = text
self.lemmas = lemmas
self.level = level
self.status = status
    def __repr__(self):
        return "<Link %r (%r)>" % (self.title, self.href)
class Log(Base):
__tablename__ = 'log'
id = Column(Integer, primary_key=True, autoincrement=True)
action = Column(String(64))
status = Column(String(64))
time = Column(Integer)
donor = Column(String(64))
def __init__(self, action, status, time, donor):
self.action = action
self.status = status
self.time = time
self.donor = donor
    def __repr__(self):
        return "<Log %r: %r>" % (self.action, self.status)
# Create the tables
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()
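# A minimal helper for the Log table defined above. The original section never
# writes Log rows, so the action/status vocabulary here is an assumption, not
# part of the original script.
def write_log(action, status, donor):
    entry = Log(action, status, int(time.time()), donor)  # time column is an integer timestamp
    sqllite_session.add(entry)
    sqllite_session.commit()
# hypothetical usage: write_log("download", "ok", "https://habr.com/ru/all/")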
def select_good_link(link_href, donor):
    is_bad = False
    donor_parsed = urlparse(donor)
    link_parsed = urlparse(link_href)
    # normalize the link to an absolute URL
    if link_parsed.hostname is None:
        link_2 = donor_parsed.scheme + "://" + donor_parsed.hostname
        link_2 = urljoin(link_2, link_href)
    else:
        link_2 = link_href
    # keep only internal links (subdomains included)
    donor_domain = tldextract.extract(donor).domain
    link_domain = tldextract.extract(link_2).domain
    print("link_domain: ", link_domain)
    if link_domain != donor_domain:
        print("External link, skipping")
        is_bad = True
    # drop mailto links
    pos = str(link_2).find("mailto")
    if pos != -1:
        print("mailto, skipping")
        is_bad = True
    # GET parameters are not needed
    try:
        link_2, get = str(link_2).split("?", 1)
    except ValueError:
        pass
    # skip binary documents (.pdf, .doc, ...)
    filename, file_extension = os.path.splitext(link_2)
    if file_extension in bad_file_extensions:
        print(file_extension, ", skipping")
        is_bad = True
    # the bare domain itself is not needed either
    if link_2 == donor:
        print("Home page, skipping")
        is_bad = True
    return is_bad, link_2
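# The Links.lemmas column stores a bag of words. A minimal sketch of how page
# text could be lemmatized with the Mystem and stopwords imports above; the
# function name and the exact filtering rules are assumptions, not original code.
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
def lemmatize_text(text):
    # Mystem returns a list of lemmas interleaved with whitespace tokens
    tokens = [t.strip() for t in mystem.lemmatize(text.lower())]
    tokens = [t for t in tokens
              if t and t not in russian_stopwords and t not in punctuation]
    return " ".join(tokens)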
# parse 10 links per run
for i in range(0, 10):
    link = sqllite_session.query(Links).filter(Links.status == 0).order_by(func.random()).first()
    if link is None:  # nothing left to download
        break
    print("Parsing ", link.href)
    # build the request
user_agent = random.choice(user_agents)
donor_parsed = urlparse(link.href)
headers = {
"Host": str(donor_parsed.hostname),
'User-Agent': str(user_agent),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Referer': str(link.href),
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive'}
try:
response = requests.get(link.href, headers=headers)
print("status_code: ", response.status_code)
if (response.status_code == 200):
link.html = response.text
soup = BeautifulSoup(response.text, "lxml")
            title_tag = soup.find('title')
            title = str(title_tag.string.strip()) if title_tag and title_tag.string else ""
print("title: ", title)
link.title = title
text = soup.get_text()
link.text = text
    except Exception:
        print("Error, skipping")
        traceback.print_exc()
    print()
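    # The loop above never marks a row as processed or commits, so the
    # status == 0 query would keep returning the same links. A minimal sketch
    # of the missing bookkeeping, assuming the status codes documented on the
    # Links model and the keywords list above; this is not the original code.
    if link.text:
        # link.lemmas could also be filled here, e.g. via the lemmatize_text sketch above
        found = [kw for kw in keywords if kw in link.text.lower()]
        link.status = 2 if found else 1
    else:
        link.status = 1  # download failed or page was empty; don't pick the row again
    # enqueue internal links discovered on this page
    if link.html:
        page = BeautifulSoup(link.html, "lxml")
        for a in page.find_all("a", href=True):
            is_bad, href = select_good_link(a["href"], link.donor)
            already_seen = sqllite_session.query(Links).filter(Links.href == href).first()
            if not is_bad and already_seen is None:
                sqllite_session.add(Links(link.donor, "", href, int(time.time()),
                                          "", "", "", (link.level or 0) + 1, 0))
    sqllite_session.commit()
    time.sleep(random.uniform(1, 3))  # be polite to the donor site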