# hh_parser.wsgi — hh.ru vacancy parser (WSGI entry point)
  1. # -*- encoding: utf-8 -*-
  2. #
  3. import os, urllib.parse, traceback
  4. virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
  5. activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
  6. exec(open(activate_this).read(), dict(__file__=activate_this))
  7. from bs4 import BeautifulSoup
  8. import random, time, datetime
  9. import requests
  10. from requests.exceptions import ProxyError
  11. #декларативное определение
  12. from sqlalchemy import Column, Integer, String, create_engine
  13. from sqlalchemy.ext.declarative import declarative_base
  14. from sqlalchemy.orm import sessionmaker
  15. #---------------------------------- Variables ----------
  16. areas = {"bryansk":19}
  17. specializations = ['программист', 'стажер', 'стажировка']
  18. proxies = [
  19. {
  20. 'http': 'http://linbergsergey_gmail_:56c2134eac@212.81.34.171:30013',
  21. 'https': 'http://linbergsergey_gmail_:56c2134eac@212.81.34.171:30013',
  22. },
  23. {
  24. 'http': 'http://linbergsergey_gmail_:56c2134eac@212.81.33.230:30013',
  25. 'https': 'http://linbergsergey_gmail_:56c2134eac@212.81.33.230:30013',
  26. },
  27. {},
  28. ]
  29. user_agents = [
  30. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
  31. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
  32. "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  33. "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  34. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
  35. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
  36. "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  37. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  38. "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
  39. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  40. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
  41. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
  42. "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
  43. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
  44. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10.14; rv: 75.0) Gecko / 20100101 Firefox / 75.0",
  45. "Mozilla / 5.0 (Windows NT 6.1; Win64; x64; rv: 74.0) Gecko / 20100101 Firefox / 74.0",
  46. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / 80.0.3987.163 Safari / 537.36",
  47. "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
  48. ]
  49. #---------------------------------- Variables End ----------
  50. def application(env, start_response):
  51. out_s = ""
  52. #Инициализация SQLLite
  53. basedir = os.path.abspath(os.path.dirname(__file__))
  54. SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'hhtm.db')
  55. engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
  56. Base = declarative_base()
  57. class Vacancies(Base):
  58. __tablename__ = 'vacancies'
  59. id = Column(Integer, primary_key=True, autoincrement=True)
  60. title = Column(String(512))
  61. city = Column(String(20))
  62. specialization = Column(String(255))
  63. href = Column(String(512))
  64. donor = Column(String(255))
  65. vacancy_id = Column(Integer)
  66. vacancy_date = Column(Integer)
  67. parse_date = Column(Integer)
  68. employer = Column(String(255))
  69. canal_city_id = Column(Integer)
  70. canal_city_date = Column(Integer)
  71. canal_spec_id = Column(Integer)
  72. canal_spec_date = Column(Integer)
  73. def __init__(self, title, city, specialization, href, donor, vacancy_id, vacancy_date, parse_date, employer, canal_city_id, canal_city_date, canal_spec_id, canal_spec_date):
  74. self.title = title
  75. self.city = city
  76. self.specialization = specialization
  77. self.href = href
  78. self.donor = donor
  79. self.vacancy_id = vacancy_id
  80. self.vacancy_date = vacancy_date
  81. self.parse_date = parse_date
  82. self.employer = employer
  83. self.canal_city_id = canal_city_id
  84. self.canal_city_date = canal_city_date
  85. self.canal_spec_id = canal_spec_id
  86. self.canal_spec_date = canal_spec_date
  87. def __repr__(self):
  88. return "<Vacancy('%s','%s', '%s')>" % (self.title, self.specialization, self.href)
  89. class Log(Base):
  90. __tablename__ = 'log'
  91. id = Column(Integer, primary_key=True, autoincrement=True)
  92. action = Column(String(64))
  93. status = Column(String(64))
  94. time = Column(Integer)
  95. donor = Column(String(64))
  96. city = Column(String(20))
  97. specialization = Column(String(20))
  98. vacancies_count = Column(Integer)
  99. canal_id = Column(String(64))
  100. def __init__(self, action, status, time, donor, city, specialization, vacancies_count, canal_id):
  101. self.action = action
  102. self.status = status
  103. self.time = time
  104. self.donor = donor
  105. self.city = city
  106. self.specialization = specialization
  107. self.vacancies_count = vacancies_count
  108. self.canal_id = canal_id
  109. def __repr__(self):
  110. return "<Log('%s','%s', '%s')>" % (self.action, self.status)
  111. # Создание таблицы
  112. Base.metadata.create_all(engine)
  113. Session = sessionmaker(bind=engine)
  114. session = Session()
  115. #"""
  116. #формируем запрос
  117. city, area = random.choice(list(areas.items()))
  118. host = city + ".hh.ru"
  119. #t = random.choice([0,1])
  120. t = 1
  121. if (t == 1):
  122. specialization = random.choice(specializations)
  123. else:
  124. specialization = ""
  125. user_agent = random.choice(user_agents)
  126. params = {
  127. 'clusters': 'true',
  128. 'area': area,
  129. 'enable_snippets': 'true',
  130. 'salary': '',
  131. 'st': 'searchVacancy',
  132. 'text': specialization
  133. }
  134. url = "https://" + host + "/search/vacancy?"+urllib.parse.urlencode(params)
  135. referer = "https://" + host + "/search/vacancy?"+urllib.parse.urlencode(params) + "&customDomain=1"
  136. headers = {
  137. "Host": str(host),
  138. 'User-Agent': str(user_agent),
  139. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  140. 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
  141. 'Accept-Encoding': 'gzip, deflate, br',
  142. 'Referer': str(referer),
  143. 'Upgrade-Insecure-Requests': '1',
  144. 'Connection': 'keep-alive'}
  145. response = requests.get(url, headers=headers, params=params)
  146. post_html = response.text
  147. with open(os.path.join(basedir, 'response.html'), 'w', encoding="utf-8") as f:
  148. f.write(post_html)
  149. """
  150. with open(os.path.dirname(os.path.abspath(__file__)) + '/response.html', 'r', encoding='utf-8') as f:
  151. post_html = f.read()
  152. """
  153. #Парсим вакансии
  154. vacancies_count = 0
  155. soup = BeautifulSoup(post_html, "lxml")
  156. vacancies = soup.find_all('div', {'class': 'vacancy-serp-item'})
  157. for vacancy in vacancies:
  158. try:
  159. vacancy_title = str(vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-title'}).text).strip()
  160. vacancy_href = str(vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-title'}).get('href')).strip()
  161. vacancy_href = vacancy_href.split("?")[0]
  162. vacancy_id = vacancy_href.split("/")
  163. vacancy_id = int(vacancy_id[len(vacancy_id) - 1])
  164. #<span class="vacancy-serp-item__publication-date vacancy-serp-item__publication-date_s-only">24.03.21</span>
  165. """
  166. vacancy_date = str(vacancy.find('span', {'class': 'vacancy-serp-item__publication-date vacancy-serp-item__publication-date_s-only'}).text).strip()
  167. vacancy_date = int(time.mktime(datetime.datetime.strptime(vacancy_date, "%d.%m.%y").timetuple()))
  168. """
  169. #<a href="/employer/2936112" class="bloko-link bloko-link_secondary" data-qa="vacancy-serp__vacancy-employer"> Квадратный метр</a>
  170. vacancy_employer = str(vacancy.find('a', {'class': 'bloko-link bloko-link_secondary'}).text).strip()
  171. vacancy_employer_id = str(vacancy.find('a', {'class': 'bloko-link bloko-link_secondary'}).get('href')).strip()
  172. vacancy_employer_id = vacancy_employer_id.replace('/employer/', '')
  173. out_s += str(vacancy_title) + "<br>"
  174. out_s += str(vacancy_href + " :: " + str(vacancy_id)) + "<br>"
  175. #out_s += str(vacancy_date) + "<br>"
  176. out_s += str(vacancy_employer + " :: " + vacancy_employer_id) + "<br>"
  177. #уникальность вакансии
  178. out_s += "Проверяем уникальность\n" + "<br>"
  179. vacancy_n = session.query(Vacancies).filter(Vacancies.href == vacancy_href).count()
  180. out_s += str(vacancy_n) + "<br>"
  181. if (vacancy_n == 0):
  182. out_s += "Добавляем в базу" + "<br>"
  183. new_vacancy = Vacancies(
  184. title = vacancy_title,
  185. city = city,
  186. specialization = specialization,
  187. href = vacancy_href,
  188. donor = 'hh.ru',
  189. vacancy_id = vacancy_id,
  190. vacancy_date = int(time.time()),
  191. parse_date = int(time.time()),
  192. employer=vacancy_employer,
  193. canal_city_id = 0,
  194. canal_city_date = 0,
  195. canal_spec_id = 0,
  196. canal_spec_date = 0
  197. )
  198. session.add(new_vacancy)
  199. vacancies_count += 1
  200. out_s += "<br>"
  201. status = 'Ok'
  202. except Exception as e:
  203. out_s += str(traceback.format_exc())
  204. status = str(traceback.format_exc())
  205. new_log = Log(
  206. action = "parse",
  207. status = status,
  208. time = int(time.time()),
  209. donor = 'hh.ru',
  210. city = city,
  211. specialization = specialization,
  212. vacancies_count = vacancies_count,
  213. canal_id = 0,
  214. )
  215. session.add(new_log)
  216. session.commit()
  217. out_s += "<br>success"
  218. start_response('200 OK', [('Content-Type','text/html')])
  219. b = out_s.encode('utf-8')
  220. return [b]