# habr_parser.wsgi
  1. # -*- encoding: utf-8 -*-
  2. #
  3. import os, urllib.parse, traceback
  4. virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
  5. activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
  6. exec(open(activate_this).read(), dict(__file__=activate_this))
  7. from bs4 import BeautifulSoup
  8. import random, time, datetime
  9. import requests
  10. from requests.exceptions import ProxyError
  11. #декларативное определение
  12. from sqlalchemy import Column, Integer, String, Text, create_engine
  13. from sqlalchemy.ext.declarative import declarative_base
  14. from sqlalchemy.orm import sessionmaker
  15. #---------------------------------- Variables ----------
  16. user_agents = [
  17. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
  18. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
  19. "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  20. "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  21. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
  22. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
  23. "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  24. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  25. "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
  26. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  27. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
  28. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
  29. "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
  30. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
  31. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10.14; rv: 75.0) Gecko / 20100101 Firefox / 75.0",
  32. "Mozilla / 5.0 (Windows NT 6.1; Win64; x64; rv: 74.0) Gecko / 20100101 Firefox / 74.0",
  33. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / 80.0.3987.163 Safari / 537.36",
  34. "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
  35. ]
  36. #---------------------------------- Variables End ----------
  37. def application(env, start_response):
  38. out_s = ""
  39. #Инициализация SQLLite
  40. basedir = os.path.abspath(os.path.dirname(__file__))
  41. SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'habr.db')
  42. engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
  43. Base = declarative_base()
  44. class Links(Base):
  45. __tablename__ = 'links'
  46. id = Column(Integer, primary_key=True, autoincrement=True)
  47. title = Column(String(512))
  48. href = Column(String(512))
  49. donor = Column(String(255))
  50. donor_link_id = Column(Integer) #внутренний идентификатор для донора, https://habr.com/ru/company/skillfactory/blog/578014/ -> 578014
  51. parse_date = Column(Integer)
  52. text = Column(Text)
  53. def __init__(self, title, href, donor, donor_link_id, parse_date, text):
  54. self.title = title
  55. self.href = href
  56. self.donor = donor
  57. self.donor_link_id = donor_link_id
  58. self.parse_date = parse_date
  59. self.text = text
  60. def __repr__(self):
  61. return "<Link('%s', '%s')>" % (self.title, self.href)
  62. class Log(Base):
  63. __tablename__ = 'log'
  64. id = Column(Integer, primary_key=True, autoincrement=True)
  65. action = Column(String(64))
  66. status = Column(String(64))
  67. time = Column(Integer)
  68. donor = Column(String(64))
  69. def __init__(self, action, status, time, donor):
  70. self.action = action
  71. self.status = status
  72. self.time = time
  73. self.donor = donor
  74. def __repr__(self):
  75. return "<Log('%s','%s', '%s')>" % (self.action, self.status)
  76. # Создание таблицы
  77. Base.metadata.create_all(engine)
  78. Session = sessionmaker(bind=engine)
  79. sqllite_session = Session()
  80. #"""
  81. #формируем запрос
  82. user_agent = random.choice(user_agents)
  83. url = "https://habr.com/ru/all/"
  84. referer = "https://habr.com/ru/all/"
  85. headers = {
  86. "Host": str("habr.com"),
  87. 'User-Agent': str(user_agent),
  88. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  89. 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
  90. 'Accept-Encoding': 'gzip, deflate, br',
  91. 'Referer': str(referer),
  92. 'Upgrade-Insecure-Requests': '1',
  93. 'Connection': 'keep-alive'}
  94. response = requests.get(url, headers=headers)
  95. post_html = response.text
  96. with open(os.path.join(basedir, 'habr.html'), 'w', encoding="utf-8") as f:
  97. f.write(post_html)
  98. #Парсим ссылки habr.com
  99. soup = BeautifulSoup(post_html, "lxml")
  100. vacancies = soup.find_all('div', {'class': 'tm-article-snippet'})
  101. for vacancy in vacancies:
  102. try:
  103. link_title = str(vacancy.find('h2', {'class': 'tm-article-snippet__title_h2'}).text).strip()
  104. link_href = str(vacancy.find('a', {'class': 'tm-article-snippet__title-link'}).get('href')).strip()
  105. link_href = "https://habr.com" + link_href
  106. #donor_link_id - предпоследнее число из link_href
  107. donor_link_ids = link_href.split("/")
  108. donor_link_id = donor_link_ids[len(donor_link_ids)-2]
  109. out_s += str(link_title) + "<br>"
  110. out_s += str(link_href + " :: " + str(donor_link_id)) + "<br>"
  111. #уникальность ссылки
  112. out_s += "Проверяем уникальность\n" + "<br>"
  113. link_n = sqllite_session.query(Links).filter(Links.donor_link_id == donor_link_id).count()
  114. out_s += "link_n: " + str(link_n) + "<br>"
  115. if (link_n == 0):
  116. out_s += "Добавляем в базу" + "<br>"
  117. new_link = Links(
  118. title = link_title,
  119. href = link_href,
  120. donor = 'habr.ru',
  121. donor_link_id = donor_link_id,
  122. parse_date = int(time.time()),
  123. text = ""
  124. )
  125. sqllite_session.add(new_link)
  126. else:
  127. out_s += "В базе ссылка есть" + "<br>"
  128. out_s += "<br>"
  129. status = 'Ok'
  130. except Exception as e:
  131. out_s += str(traceback.format_exc())
  132. status = str(traceback.format_exc())
  133. new_log = Log(
  134. action = "parse",
  135. status = status,
  136. time = int(time.time()),
  137. donor = 'habr.ru',
  138. )
  139. sqllite_session.add(new_log)
  140. sqllite_session.commit()
  141. out_s += "<br>success"
  142. start_response('200 OK', [('Content-Type','text/html')])
  143. b = out_s.encode('utf-8')
  144. return [b]