123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- # -*- encoding: utf-8 -*-
- #
- """
- virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
- activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
- exec(open(activate_this).read(), dict(__file__=activate_this))
- """
- import os, traceback
- from bs4 import BeautifulSoup
- import random, time, datetime
- import requests
- from requests.exceptions import ProxyError
- from urllib.parse import urlparse, urljoin
- import tldextract
# declarative ORM definitions
- from sqlalchemy import Column, Integer, String, Text, create_engine
- from sqlalchemy.ext.declarative import declarative_base
- from sqlalchemy.orm import sessionmaker
#---------------------------------- Variables ----------
# Seed sites to crawl for event announcements.
donors = [
    "http://npedkol.ru/",
    #"https://it-events.com/",
    #"https://habr.com/ru/all/",
    #"https://innopolis.university/",
]
# Russian keywords that mark a page as interesting:
# "olympiad", "hackathon", "contest".
keywords = [
    "олимпиада",
    "хакатон",
    "конкурс"
]
# Pool of User-Agent strings; one is chosen at random per request so the
# crawler looks less like a bot.
# Bug fix: three entries were corrupted by a machine-translation pass —
# spaces were inserted around "/" and ":" and "like Gecko" had become
# "как Gecko".  Such strings are sent verbatim as the User-Agent header,
# so they were trivially identifiable as malformed; restored to the
# canonical browser UA format.
user_agents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
]
# Link targets with these (bare, dot-less) extensions are skipped by the crawler.
bad_file_extensions = ["pdf", "doc", "docx"]
#---------------------------------- Variables End ----------
out_s = ""
# SQLite initialisation: the database file "all.db" lives next to this script.
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
# pool_pre_ping validates connections before each use so stale handles are recycled.
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
# ORM model for the collected data.
class Links(Base):
    """One discovered link together with its (eventually) downloaded page."""
    __tablename__ = 'links'
    id = Column(Integer, primary_key=True, autoincrement=True)
    donor = Column(String(255))
    title = Column(String(512))
    href = Column(String(512))
    parse_date = Column(Integer)
    html = Column(Text)
    text = Column(Text)
    lemmas = Column(Text)    # bag of words: lemmas extracted from the page text
    level = Column(Integer)  # link nesting depth relative to the site root
    # status: 0 - not downloaded; 1 - downloaded, no keyword found;
    # 2 - keyword found, notification not yet sent;
    # 3 - per the source comment also "not sent" — presumably a typo for "sent"
    status = Column(Integer)

    def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
        # Plain column-value assignment; done in a loop to keep it compact.
        for column, value in (("donor", donor), ("title", title), ("href", href),
                              ("parse_date", parse_date), ("html", html),
                              ("text", text), ("lemmas", lemmas),
                              ("level", level), ("status", status)):
            setattr(self, column, value)

    def __repr__(self):
        return f"<Link('{self.title}', '{self.href}')>"
class Log(Base):
    """Crawler action log entry."""
    __tablename__ = 'log'
    id = Column(Integer, primary_key=True, autoincrement=True)
    action = Column(String(64))  # e.g. "parse"
    status = Column(String(64))
    time = Column(Integer)       # unix timestamp of the action
    donor = Column(String(64))

    def __init__(self, action, status, time, donor):
        self.action = action
        self.status = status
        self.time = time
        self.donor = donor

    def __repr__(self):
        # Bug fix: the original format string had three %s placeholders but
        # only two arguments, so calling repr() raised TypeError.
        return "<Log('%s', '%s')>" % (self.action, self.status)
# Create the tables (no-op when they already exist).
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()
def select_good_link(link_href, donor):
    """Canonicalise *link_href* found on *donor* and decide whether to crawl it.

    A link is rejected (is_bad=True) when it is empty, external to the donor's
    registrable domain, uses a non-http(s) scheme (mailto: etc.), or points at a
    blacklisted file type.  GET parameters are always stripped.

    Returns:
        (is_bad, link_2): bool flag and the canonical absolute URL (no query).
    """
    is_bad = False
    # Robustness fix: <a> tags without href yield None/"" and used to crash urlparse.
    if not link_href:
        return True, ""
    donor_parsed = urlparse(donor)
    link_parsed = urlparse(link_href)
    # Canonicalise relative links against the donor's scheme://host root.
    if link_parsed.hostname is None:
        root = donor_parsed.scheme + "://" + donor_parsed.hostname
        link_2 = urljoin(root, link_href)
    else:
        link_2 = link_href
    # Keep only internal links (subdomains included): compare registrable domains.
    donor_domain = tldextract.extract(donor).domain
    link_domain = tldextract.extract(link_2).domain
    print("link_domain: ", link_domain)
    if link_domain != donor_domain:
        print("Внешняя ссылка, пропускаем")
        is_bad = True
    # Skip non-web schemes (mailto:, tel:, javascript:, ...).
    # Bug fix: the original substring test find("mailto") also flagged normal
    # http URLs that merely contained the word "mailto" somewhere in the path.
    if urlparse(link_2).scheme not in ("http", "https", ""):
        print("mailto, пропускаем")
        is_bad = True
    # Skip .pdf/.doc/... targets.
    # Bug fixes: splitext keeps the leading dot (".pdf") while the blacklist
    # stores bare extensions, so the check never matched; it also ran on the
    # full URL, so any query string corrupted the extension.
    _, file_extension = os.path.splitext(urlparse(link_2).path)
    if file_extension.lstrip(".").lower() in bad_file_extensions:
        print(file_extension, ", пропускаем")
        is_bad = True
    # Drop GET parameters: everything from the first "?" onward.
    # Bug fix: the original split("?") unpack raised (and was swallowed by a
    # bare except) whenever the URL contained more than one "?", leaving the
    # query in place; partition() handles any number of "?" correctly.
    link_2, _, _ = str(link_2).partition("?")
    return is_bad, link_2
# Main crawl loop: fetch each donor's front page, extract anchors, and store
# previously-unseen internal links for later processing (status=0).
for donor in donors:
    print("Парсим ", donor)
    # Build a browser-like request so plain bot traffic is less likely rejected.
    user_agent = random.choice(user_agents)
    donor_parsed = urlparse(donor)

    headers = {
        "Host": str(donor_parsed.hostname),
        'User-Agent': str(user_agent),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': str(donor),
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive'}
    # Robustness fix: a timeout so one dead host cannot hang the whole crawl.
    response = requests.get(donor, headers=headers, timeout=30)
    post_html = response.text
    # Keep a local snapshot of the fetched page for debugging.
    with open(os.path.join(basedir, donor_parsed.hostname + '.html'), 'w', encoding="utf-8") as f:
        f.write(post_html)
    # Extract all anchors from the page.
    soup = BeautifulSoup(post_html, "lxml")
    for link in soup.find_all('a'):
        link_href = link.get('href')
        print(link_href)
        is_bad, link_2 = select_good_link(link_href, donor)
        if is_bad:
            print()
            continue
        print("link_2: ", link_2)
        # Deduplicate on the canonical URL.
        print("Проверяем уникальность")
        link_n = sqllite_session.query(Links).filter(Links.href == link_2).count()
        print("link_n: " + str(link_n))
        if link_n == 0:
            print("Добавляем в базу")
            new_link = Links(
                title="",
                # Bug fix: store the canonical URL (link_2).  The original
                # stored the raw href while the uniqueness query above filters
                # on link_2, so stored rows never matched and duplicates
                # accumulated on every run.
                href=link_2,
                donor=donor,
                parse_date=int(time.time()),
                html="",
                text="",
                lemmas="",
                level=1,
                status=0
            )
            sqllite_session.add(new_link)
        else:
            print("В базе ссылка есть")

        print()

"""
new_log = Log(
    action = "parse",
    status = status,
    time = int(time.time()),
    donor = 'habr.ru',
)
sqllite_session.add(new_log)
"""
sqllite_session.commit()
|