# crauler.py

# -*- encoding: utf-8 -*-
#
# Commented-out virtualenv activation, kept for reference:
"""
virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
exec(open(activate_this).read(), dict(__file__=activate_this))
"""
import os, urllib.parse, traceback
from bs4 import BeautifulSoup
import random, time, datetime
import requests
from requests.exceptions import ProxyError
# declarative ORM mapping (declarative_base lives in sqlalchemy.orm as of 1.4;
# the old sqlalchemy.ext.declarative import is deprecated)
from sqlalchemy import Column, Integer, String, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
#---------------------------------- Variables ----------
donors = [
    "https://it-events.com/",
    "https://habr.com/ru/all/",
    "http://npedkol.ru/",
    "https://innopolis.university/",
]
# Russian keywords the crawler is meant to match: "олимпиада" (olympiad),
# "хакатон" (hackathon), "конкурс" (contest). A usage sketch follows the list.
keywords = [
    "олимпиада",
    "хакатон",
    "конкурс",
]
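# `keywords` is declared but never applied below; a minimal sketch of how it
# could filter parsed titles. This helper is an assumption for illustration,
# not part of the original flow:
def title_matches_keywords(title):
    """Case-insensitive substring match against the keywords list (sketch)."""
    title_lower = title.lower()
    return any(keyword in title_lower for keyword in keywords)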
user_agents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
    "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
    "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
    "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)",
]
#---------------------------------- Variables End ----------
out_s = ""
# Initialize SQLite
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
class Links(Base):
    __tablename__ = 'links'
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(512))
    href = Column(String(512))
    donor = Column(String(255))
    donor_link_id = Column(Integer)  # donor-internal id, e.g. https://habr.com/ru/company/skillfactory/blog/578014/ -> 578014
    parse_date = Column(Integer)  # Unix timestamp of the parse
    text = Column(Text)

    def __init__(self, title, href, donor, donor_link_id, parse_date, text):
        self.title = title
        self.href = href
        self.donor = donor
        self.donor_link_id = donor_link_id
        self.parse_date = parse_date
        self.text = text

    def __repr__(self):
        return "<Link('%s', '%s')>" % (self.title, self.href)
class Log(Base):
    __tablename__ = 'log'
    id = Column(Integer, primary_key=True, autoincrement=True)
    action = Column(String(64))
    status = Column(String(64))
    time = Column(Integer)
    donor = Column(String(64))

    def __init__(self, action, status, time, donor):
        self.action = action
        self.status = status
        self.time = time
        self.donor = donor

    def __repr__(self):
        # the original had three placeholders for two values, which raised TypeError
        return "<Log('%s', '%s')>" % (self.action, self.status)
# Create the tables
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
sqllite_session = Session()
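# Illustrative helper (not called below): inspect the most recently parsed
# rows, assuming the same session object. Handy as a quick sanity check of
# the schema above.
def latest_links(session, limit=5):
    """Return the `limit` most recent Links rows, newest first (sketch)."""
    return (session.query(Links)
            .order_by(Links.parse_date.desc())
            .limit(limit)
            .all())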
  92. #"""
  93. #формируем запрос
  94. user_agent = random.choice(user_agents)
  95. url = "https://habr.com/ru/all/"
  96. referer = "https://habr.com/ru/all/"
  97. headers = {
  98. "Host": str("habr.com"),
  99. 'User-Agent': str(user_agent),
  100. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  101. 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
  102. 'Accept-Encoding': 'gzip, deflate, br',
  103. 'Referer': str(referer),
  104. 'Upgrade-Insecure-Requests': '1',
  105. 'Connection': 'keep-alive'}
response = requests.get(url, headers=headers, timeout=30)  # timeout added so a stalled request cannot hang the crawl
post_html = response.text
# Save the raw page for debugging
with open(os.path.join(basedir, 'habr.html'), 'w', encoding="utf-8") as f:
    f.write(post_html)
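# `ProxyError` is imported above but never handled; a hedged sketch of a
# retry wrapper that rotates User-Agents between attempts. The function name,
# retry policy, and delay are assumptions, not part of the original script.
def fetch_with_retry(target_url, attempts=3, delay=5):
    """Fetch `target_url`, retrying on proxy/connection errors (sketch)."""
    for attempt in range(attempts):
        try:
            retry_headers = dict(headers)
            retry_headers['User-Agent'] = random.choice(user_agents)
            return requests.get(target_url, headers=retry_headers, timeout=30)
        except (ProxyError, requests.exceptions.ConnectionError):
            time.sleep(delay)  # back off before the next attempt
    raise RuntimeError("all %d attempts to fetch %s failed" % (attempts, target_url))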
# Parse article links from habr.com
soup = BeautifulSoup(post_html, "lxml")
vacancies = soup.find_all('div', {'class': 'tm-article-snippet'})
status = 'Ok'  # default, so the Log row below is valid even if no snippets were found
for vacancy in vacancies:
    try:
        link_title = str(vacancy.find('h2', {'class': 'tm-article-snippet__title_h2'}).text).strip()
        link_href = str(vacancy.find('a', {'class': 'tm-article-snippet__title-link'}).get('href')).strip()
        link_href = "https://habr.com" + link_href
        # donor_link_id is the second-to-last path segment of link_href
        donor_link_id = link_href.split("/")[-2]
        out_s += str(link_title) + "<br>"
        out_s += link_href + " :: " + str(donor_link_id) + "<br>"
        # Deduplicate by the donor-internal id
        out_s += "Checking uniqueness" + "<br>"
        link_n = sqllite_session.query(Links).filter(Links.donor_link_id == donor_link_id).count()
        out_s += "link_n: " + str(link_n) + "<br>"
        if link_n == 0:
            out_s += "Adding to the database" + "<br>"
            new_link = Links(
                title=link_title,
                href=link_href,
                donor='habr.ru',
                donor_link_id=donor_link_id,
                parse_date=int(time.time()),
                text="",
            )
            sqllite_session.add(new_link)
        else:
            out_s += "Link is already in the database" + "<br>"
        out_s += "<br>"
        status = 'Ok'
    except Exception:
        out_s += str(traceback.format_exc())
        status = str(traceback.format_exc())
# One log row per run, written after the loop
new_log = Log(
    action="parse",
    status=status,
    time=int(time.time()),
    donor='habr.ru',
)
sqllite_session.add(new_log)
sqllite_session.commit()
out_s += "<br>success"
b = out_s.encode('utf-8')  # presumably handed to the surrounding web layer as the response body (not shown here)