# -*- encoding: utf-8 -*-
"""Collect internal links from donor sites and queue them in a SQLite database.

For every URL in ``donors`` the script downloads the page, saves the raw HTML
next to this file, extracts all ``<a>`` links, keeps the internal ones and
inserts the links that are not stored yet into the ``links`` table.
"""

# Virtualenv activation snippet, disabled in the original and kept for reference:
# virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
# activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
# exec(open(activate_this).read(), dict(__file__=activate_this))

import datetime
import os
import random
import time
import traceback
from urllib.parse import urlparse, urljoin

import requests
import tldextract
from bs4 import BeautifulSoup
from requests.exceptions import ProxyError

# declarative mapping
from sqlalchemy import Column, Integer, String, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

#---------------------------------- Variables ----------

# Pages whose links are collected.
donors = [
    "http://npedkol.ru/",
    #"https://it-events.com/",
    #"https://habr.com/ru/all/",
    #"https://innopolis.university/",
]

# Keywords of interest (not referenced by the code visible in this file).
keywords = [
    "олимпиада",
    "хакатон",
    "конкурс"
]

# Pool of User-Agent strings; one is picked at random for every request.
# Header values must be latin-1 encodable, so the entries are plain ASCII.
user_agents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
]

# File extensions (without the leading dot, lower case) that are never stored.
bad_file_extensions = ["pdf", "doc", "docx"]

# Seconds to wait for a donor before giving up on it.
REQUEST_TIMEOUT = 30

#---------------------------------- Variables End ----------

out_s = ""

# SQLite initialisation: the database file lives next to this script.
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()


# Table with the collected information.
class Links(Base):
    """One collected link together with the data gathered for it."""

    __tablename__ = 'links'

    id = Column(Integer, primary_key=True, autoincrement=True)
    donor = Column(String(255))
    title = Column(String(512))
    href = Column(String(512))
    parse_date = Column(Integer)
    html = Column(Text)
    text = Column(Text)
    lemmas = Column(Text)  # set of lemmas from the text (bag of words)
    level = Column(Integer)  # nesting level of the link from the site root
    # 0 - not loaded; 1 - loaded, no keyword; 2 - keyword found, notification
    # not sent; 3 - notification not sent (sic; presumably "notification
    # sent" was meant - TODO confirm)
    status = Column(Integer)

    def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
        self.donor = donor
        self.title = title
        self.href = href
        self.parse_date = parse_date
        self.html = html
        self.text = text
        self.lemmas = lemmas
        self.level = level
        self.status = status

    def __repr__(self):
        # The original format string was empty, which made repr() raise
        # TypeError ("not all arguments converted").
        return "<Links(%r, %r)>" % (self.title, self.href)


class Log(Base):
    """One journal record about an action performed for a donor."""

    __tablename__ = 'log'

    id = Column(Integer, primary_key=True, autoincrement=True)
    action = Column(String(64))
    status = Column(String(64))
    time = Column(Integer)
    donor = Column(String(64))

    def __init__(self, action, status, time, donor):
        self.action = action
        self.status = status
        self.time = time
        self.donor = donor

    def __repr__(self):
        # Same empty-format-string defect as in Links.__repr__.
        return "<Log(%r, %r)>" % (self.action, self.status)


# Create the tables.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()


def select_good_link(link_href, donor):
    """Decide whether a link should be stored and return its canonical form.

    Args:
        link_href: raw ``href`` value of an ``<a>`` tag; may be relative,
            empty or ``None``.
        donor: URL of the page the link was found on.

    Returns:
        A ``(is_bad, link)`` tuple. ``is_bad`` is True when the link has to be
        skipped: empty href, different registered domain than ``donor``,
        contains "mailto", or ends with one of ``bad_file_extensions``.
        ``link`` is the absolute URL with the query string removed (an empty
        string when ``link_href`` is empty).
    """
    # <a> tags without an href carry nothing to store.
    if not link_href:
        print("Пустая ссылка, пропускаем")
        return True, ""

    is_bad = False

    # Bring the link to canonical (absolute) form. urljoin resolves relative
    # and protocol-relative links against the page URL and leaves absolute
    # links as they are.
    link_2 = urljoin(donor, link_href)

    # Only internal links are of interest (subdomains included).
    donor_domain = tldextract.extract(donor).domain
    link_domain = tldextract.extract(link_2).domain
    print("link_domain: ", link_domain)
    if link_domain != donor_domain:
        print("Внешняя ссылка, пропускаем")
        is_bad = True

    # Drop .pdf, mailto and the like.
    if "mailto" in link_2:
        print("mailto, пропускаем")
        is_bad = True

    # splitext keeps the leading dot and the list does not, so compare the
    # dot-less, lower-cased extension. The extension is taken from the URL
    # path so that a query string cannot hide it.
    file_extension = os.path.splitext(urlparse(link_2).path)[1]
    if file_extension.lstrip(".").lower() in bad_file_extensions:
        print(file_extension, ", пропускаем")
        is_bad = True

    # GET variables are not needed; cut at the first "?" so that URLs with
    # several "?" are handled as well.
    link_2 = link_2.split("?", 1)[0]

    return is_bad, link_2


def _build_headers(donor):
    """Return browser-like request headers for ``donor`` with a random User-Agent."""
    return {
        "Host": str(urlparse(donor).hostname),
        'User-Agent': str(random.choice(user_agents)),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
        # NOTE(review): "br" is advertised here; presumably the HTTP stack
        # needs brotli support for response.text to be readable - confirm.
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': str(donor),
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive'}


def _store_new_links(donor, post_html):
    """Add every good link of ``post_html`` that is not in the database yet."""
    soup = BeautifulSoup(post_html, "lxml")

    for link in soup.find_all('a'):
        link_href = link.get('href')
        print(link_href)

        is_bad, link_2 = select_good_link(link_href, donor)
        if is_bad:
            print()
            continue

        print("link_2: ", link_2)

        # link uniqueness
        print("Проверяем уникальность")
        link_n = sqllite_session.query(Links).filter(Links.href == link_2).count()
        print("link_n: " + str(link_n))

        if link_n == 0:
            print("Добавляем в базу")
            new_link = Links(
                title="",
                # Store the canonical form: it is the value the uniqueness
                # query above compares against.
                href=link_2,
                donor=donor,
                parse_date=int(time.time()),
                html="",
                text="",
                lemmas="",
                level=1,
                status=0,
            )
            sqllite_session.add(new_link)
        else:
            print("В базе ссылка есть")

        print()


for donor in donors:
    print("Парсим ", donor)

    # build and send the request
    try:
        response = requests.get(
            donor, headers=_build_headers(donor), timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        # One unreachable donor must not stop the whole run.
        traceback.print_exc()
        continue
    post_html = response.text

    # Keep a copy of the downloaded page next to the script.
    donor_parsed = urlparse(donor)
    with open(os.path.join(basedir, donor_parsed.hostname+'.html'), 'w', encoding="utf-8") as f:
        f.write(post_html)

    # Parse the links
    _store_new_links(donor, post_html)

    # Journal record, disabled in the original (``status`` is not defined):
    # new_log = Log(
    #     action = "parse",
    #     status = status,
    #     time = int(time.time()),
    #     donor = 'habr.ru',
    # )
    # sqllite_session.add(new_log)

    sqllite_session.commit()