# crauler.py
  1. # -*- encoding: utf-8 -*-
  2. #
  3. """
  4. virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
  5. activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
  6. exec(open(activate_this).read(), dict(__file__=activate_this))
  7. """
  8. import os, traceback
  9. from bs4 import BeautifulSoup
  10. import random, time, datetime
  11. import requests
  12. from requests.exceptions import ProxyError
  13. from urllib.parse import urlparse, urljoin
  14. import tldextract
  15. #декларативное определение
  16. from sqlalchemy import Column, Integer, String, Text, create_engine
  17. from sqlalchemy.ext.declarative import declarative_base
  18. from sqlalchemy.orm import sessionmaker
  19. #---------------------------------- Variables ----------
  20. donors = [
  21. "http://npedkol.ru/",
  22. #"https://it-events.com/",
  23. #"https://habr.com/ru/all/",
  24. #"https://innopolis.university/",
  25. ]
  26. keywords = [
  27. "олимпиада",
  28. "хакатон",
  29. "конкурс"
  30. ]
  31. user_agents = [
  32. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
  33. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
  34. "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  35. "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  36. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
  37. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
  38. "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  39. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  40. "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
  41. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  42. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
  43. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
  44. "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
  45. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
  46. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10.14; rv: 75.0) Gecko / 20100101 Firefox / 75.0",
  47. "Mozilla / 5.0 (Windows NT 6.1; Win64; x64; rv: 74.0) Gecko / 20100101 Firefox / 74.0",
  48. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / 80.0.3987.163 Safari / 537.36",
  49. "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
  50. ]
  51. bad_file_extensions = ["pdf", "doc", "docx"]
  52. #---------------------------------- Variables End ----------
# Accumulator for output text (not used elsewhere in this chunk).
out_s = ""
# SQLite initialization: the database file lives next to this script.
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
# pool_pre_ping tests each connection before use so stale handles are recycled.
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
  59. # класс БД собранной информации
  60. class Links(Base):
  61. __tablename__ = 'links'
  62. id = Column(Integer, primary_key=True, autoincrement=True)
  63. donor = Column(String(255))
  64. title = Column(String(512))
  65. href = Column(String(512))
  66. parse_date = Column(Integer)
  67. html = Column(Text)
  68. text = Column(Text)
  69. lemmas = Column(Text) # набор лемм из текста (мешок слов)
  70. level = Column(Integer) # уровень вложенности ссылки от корня сайта
  71. status = Column(Integer) # 0 - не загружена, 1 - загружена, ключевика нет; 2 - ключевик есть, уведомление не отправлено; 3 - уведомление не отправлено
  72. def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
  73. self.donor = donor
  74. self.title = title
  75. self.href = href
  76. self.parse_date = parse_date
  77. self.html = html
  78. self.text = text
  79. self.lemmas = lemmas
  80. self.level = level
  81. self.status = status
  82. def __repr__(self):
  83. return "<Link('%s', '%s')>" % (self.title, self.href)
  84. class Log(Base):
  85. __tablename__ = 'log'
  86. id = Column(Integer, primary_key=True, autoincrement=True)
  87. action = Column(String(64))
  88. status = Column(String(64))
  89. time = Column(Integer)
  90. donor = Column(String(64))
  91. def __init__(self, action, status, time, donor):
  92. self.action = action
  93. self.status = status
  94. self.time = time
  95. self.donor = donor
  96. def __repr__(self):
  97. return "<Log('%s','%s', '%s')>" % (self.action, self.status)
# Create the tables (no-op if they already exist).
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()
  102. def select_good_link(link_href, donor):
  103. is_bad = False
  104. donor_parsed = urlparse(donor)
  105. link_parsed = urlparse(link_href)
  106. # приводим ссылку к каноничному виду
  107. if (link_parsed.hostname == None):
  108. link_2 = donor_parsed.scheme + "://" + donor_parsed.hostname
  109. link_2 = urljoin(link_2, link_href)
  110. else:
  111. link_2 = link_href
  112. # нас интересуют только внутренние ссылки (и с поддоменов тоже)
  113. donor_domain = tldextract.extract(donor).domain
  114. link_domain = tldextract.extract(link_2).domain
  115. print("link_domain: ", link_domain)
  116. if (link_domain != donor_domain):
  117. print("Внешняя ссылка, пропускаем")
  118. is_bad = True
  119. # убираем .pdf, mailto и пр.
  120. pos = str(link_2).find("mailto")
  121. if (pos != -1):
  122. print("mailto, пропускаем")
  123. is_bad = True
  124. filename, file_extension = os.path.splitext(link_2)
  125. if (file_extension in bad_file_extensions):
  126. print(file_extension, ", пропускаем")
  127. is_bad = True
  128. # GET переменные не нужны
  129. try:
  130. link_2, get = str(link_2).split("?")
  131. except:
  132. pass
  133. return is_bad, link_2
  134. for donor in donors:
  135. print("Парсим ", donor)
  136. #формируем запрос
  137. user_agent = random.choice(user_agents)
  138. donor_parsed = urlparse(donor)
  139. headers = {
  140. "Host": str(donor_parsed.hostname),
  141. 'User-Agent': str(user_agent),
  142. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  143. 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
  144. 'Accept-Encoding': 'gzip, deflate, br',
  145. 'Referer': str(donor),
  146. 'Upgrade-Insecure-Requests': '1',
  147. 'Connection': 'keep-alive'}
  148. response = requests.get(donor, headers=headers)
  149. post_html = response.text
  150. with open(os.path.join(basedir, donor_parsed.hostname+'.html'), 'w', encoding="utf-8") as f:
  151. f.write(post_html)
  152. # Парсим ссылки
  153. soup = BeautifulSoup(post_html, "lxml")
  154. all_links = soup.find_all('a')
  155. for link in all_links:
  156. link_href = link.get('href')
  157. print(link_href)
  158. is_bad, link_2 = select_good_link(link_href, donor)
  159. if (is_bad):
  160. print()
  161. continue
  162. print("link_2: ", link_2)
  163. #уникальность ссылки
  164. print("Проверяем уникальность")
  165. link_n = sqllite_session.query(Links).filter(Links.href == link_2).count()
  166. print("link_n: " + str(link_n))
  167. if (link_n == 0):
  168. print("Добавляем в базу")
  169. new_link = Links(
  170. title = "",
  171. href = link_href,
  172. donor = donor,
  173. parse_date = int(time.time()),
  174. html = "",
  175. text = "",
  176. lemmas = "",
  177. level = 1,
  178. status = 0
  179. )
  180. sqllite_session.add(new_link)
  181. else:
  182. print("В базе ссылка есть")
  183. print ()
  184. """
  185. new_log = Log(
  186. action = "parse",
  187. status = status,
  188. time = int(time.time()),
  189. donor = 'habr.ru',
  190. )
  191. sqllite_session.add(new_log)
  192. """
  193. sqllite_session.commit()