parser.py

# -*- encoding: utf-8 -*-
#
# virtualenv activation, disabled and kept as a module docstring:
"""
virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
exec(open(activate_this).read(), dict(__file__=activate_this))
"""
import os, traceback
from bs4 import BeautifulSoup
import random, time, datetime
import requests
from requests.exceptions import ProxyError
from urllib.parse import urlparse, urljoin
import tldextract
# download the stopwords corpus; this only needs to run once
import nltk
nltk.download("stopwords")
#--------#
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
# declarative ORM definitions
from sqlalchemy import Column, Integer, String, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import func
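
# Mystem, stopwords and punctuation are imported for text processing that is
# not shown in this listing; they feed the Links.lemmas "bag of words" column
# defined below. A minimal illustrative sketch of such a helper --
# hypothetical, not the author's actual implementation:
def lemmatize_text(text):
    """Turn raw page text into a space-separated bag of Russian lemmas."""
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    tokens = mystem.lemmatize(text.lower())
    # drop whitespace tokens, stopwords and lone punctuation characters
    tokens = [t.strip() for t in tokens
              if t.strip() and t.strip() not in russian_stopwords
              and t.strip() not in punctuation]
    return " ".join(tokens)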
#---------------------------------- Variables ----------
donors = [
    "http://npedkol.ru/",
    "https://it-events.com/",
    "https://habr.com/ru/all/",
    "https://innopolis.university/",
]
# search keywords (Russian for "olympiad", "hackathon", "contest")
keywords = [
    "олимпиада",
    "хакатон",
    "конкурс"
]
# User-Agent pool; one is picked at random per request
user_agents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
]
# note: os.path.splitext() returns the extension with a leading dot
bad_file_extensions = [".pdf", ".doc", ".docx"]
#---------------------------------- Variables End ----------
out_s = ""
# SQLite initialization
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
# ORM model for the collected links
class Links(Base):
    __tablename__ = 'links'
    id = Column(Integer, primary_key=True, autoincrement=True)
    donor = Column(String(255))
    title = Column(String(512))
    href = Column(String(512))
    parse_date = Column(Integer)
    html = Column(Text)
    text = Column(Text)
    lemmas = Column(Text)  # bag of words: lemmas extracted from the page text
    level = Column(Integer)  # link depth relative to the site root
    status = Column(Integer)  # 0 - not downloaded; 1 - downloaded, no keyword; 2 - keyword found, notification not sent; 3 - notification sent

    def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
        self.donor = donor
        self.title = title
        self.href = href
        self.parse_date = parse_date
        self.html = html
        self.text = text
        self.lemmas = lemmas
        self.level = level
        self.status = status

    def __repr__(self):
        return "<Link('%s', '%s')>" % (self.title, self.href)
class Log(Base):
    __tablename__ = 'log'
    id = Column(Integer, primary_key=True, autoincrement=True)
    action = Column(String(64))
    status = Column(String(64))
    time = Column(Integer)
    donor = Column(String(64))

    def __init__(self, action, status, time, donor):
        self.action = action
        self.status = status
        self.time = time
        self.donor = donor

    def __repr__(self):
        return "<Log('%s', '%s')>" % (self.action, self.status)

# create the tables
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()
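
# The crawl loop below pulls rows with status == 0, but nothing in this
# listing inserts the initial donor URLs. A hypothetical seeding helper
# (illustrative sketch only, assuming level 0 and empty content fields):
def seed_donors():
    for donor in donors:
        # skip donors that are already queued
        if sqllite_session.query(Links).filter(Links.href == donor).first() is None:
            sqllite_session.add(Links(donor, "", donor, int(time.time()), "", "", "", 0, 0))
    sqllite_session.commit()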
def select_good_link(link_href, donor):
    is_bad = False
    donor_parsed = urlparse(donor)
    link_parsed = urlparse(link_href)
    # normalize the link to its canonical absolute form
    if link_parsed.hostname is None:
        link_2 = donor_parsed.scheme + "://" + donor_parsed.hostname
        link_2 = urljoin(link_2, link_href)
    else:
        link_2 = link_href
    # we only care about internal links (subdomains included)
    donor_domain = tldextract.extract(donor).domain
    link_domain = tldextract.extract(link_2).domain
    print("link_domain: ", link_domain)
    if link_domain != donor_domain:
        print("external link, skipping")
        is_bad = True
    # drop mailto: links
    pos = str(link_2).find("mailto")
    if pos != -1:
        print("mailto, skipping")
        is_bad = True
    # GET parameters are not needed
    try:
        link_2, get = str(link_2).split("?", 1)
    except ValueError:
        pass
    # drop links to .pdf, .doc and the like
    filename, file_extension = os.path.splitext(link_2)
    if file_extension in bad_file_extensions:
        print(file_extension, ", skipping")
        is_bad = True
    # the donor root itself is not needed either
    if link_2 == donor:
        print("home page, skipping")
        is_bad = True
    return is_bad, link_2
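
# Illustrative behaviour of select_good_link() (hand-worked examples using
# the donors list above, not output from an actual run):
#   select_good_link("/news/123", "http://npedkol.ru/")
#     -> (False, "http://npedkol.ru/news/123")        # internal link, kept
#   select_good_link("https://habr.com/post/1", "http://npedkol.ru/")
#     -> (True, "https://habr.com/post/1")            # external domain, flagged
#   select_good_link("/files/rules.pdf", "http://npedkol.ru/")
#     -> (True, "http://npedkol.ru/files/rules.pdf")  # .pdf, flagged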
# crawl 10 links per run
for i in range(0, 10):
    link = sqllite_session.query(Links).filter(Links.status == 0).order_by(func.random()).first()
    if link is None:  # nothing left to crawl
        break
    print("Parsing ", link.href)
    # build the request
    user_agent = random.choice(user_agents)
    donor_parsed = urlparse(link.href)
    headers = {
        "Host": str(donor_parsed.hostname),
        'User-Agent': str(user_agent),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': str(link.href),
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive'}
    try:
        response = requests.get(link.href, headers=headers)
        print("status_code: ", response.status_code)
        if response.status_code == 200:
            link.html = response.text
            soup = BeautifulSoup(response.text, "lxml")
            title = str(soup.find('title').string.strip())
            print("title: ", title)
            link.title = title
            text = soup.get_text()
            link.text = text
    except Exception:
        print("Error, skipping")
        traceback.print_exc()
    print()