# parser.py
  1. # -*- encoding: utf-8 -*-
  2. #
  3. """
  4. virtual_env = os.path.expanduser('~/projects/world-it-planet/env')
  5. activate_this = os.path.join(virtual_env, 'bin/activate_this.py')
  6. exec(open(activate_this).read(), dict(__file__=activate_this))
  7. """
  8. import os, traceback
  9. from bs4 import BeautifulSoup
  10. import random, time, datetime
  11. import requests
  12. from requests.exceptions import ProxyError
  13. from urllib.parse import urlparse, urljoin
  14. import tldextract
  15. # download stopwords corpus, you need to run it once
  16. import nltk
  17. nltk.download("stopwords")
  18. #--------#
  19. from nltk.corpus import stopwords
  20. from pymystem3 import Mystem
  21. from string import punctuation
  22. #декларативное определение
  23. from sqlalchemy import Column, Integer, String, Text, create_engine
  24. from sqlalchemy.ext.declarative import declarative_base
  25. from sqlalchemy.orm import sessionmaker
  26. from sqlalchemy.sql.expression import func
  27. #---------------------------------- Variables ----------
  28. donors = [
  29. "http://npedkol.ru/",
  30. "https://it-events.com/",
  31. "https://habr.com/ru/all/",
  32. "https://innopolis.university/",
  33. ]
  34. keywords = [
  35. "олимпиада",
  36. "хакатон",
  37. "конкурс"
  38. ]
  39. user_agents = [
  40. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0",
  41. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
  42. "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  43. "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  44. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
  45. "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
  46. "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0",
  47. "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  48. "Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.16",
  49. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
  50. "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 YaBrowser/1.7.1364.21027 Safari/537.22",
  51. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16",
  52. "Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25",
  53. "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.15",
  54. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10.14; rv: 75.0) Gecko / 20100101 Firefox / 75.0",
  55. "Mozilla / 5.0 (Windows NT 6.1; Win64; x64; rv: 74.0) Gecko / 20100101 Firefox / 74.0",
  56. "Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / 80.0.3987.163 Safari / 537.36",
  57. "Dalvik/2.1.0 (Linux; U; Android 10; Mi 9T MIUI/V12.0.5.0.QFJMIXM)"
  58. ]
  59. bad_file_extensions = ["pdf", "doc", "docx"]
  60. #---------------------------------- Variables End ----------
out_s = ""  # accumulator string — presumably filled further down the file; TODO confirm
# SQLite initialization: the database file `all.db` lives next to this script.
basedir = os.path.abspath(os.path.dirname(__file__))
SQLALCHEMY_DATABASE_URI = 'sqlite:///' + os.path.join(basedir, 'all.db')
# pool_pre_ping checks each pooled connection for liveness before use.
engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_pre_ping=True)
Base = declarative_base()
  67. # класс БД собранной информации
  68. class Links(Base):
  69. __tablename__ = 'links'
  70. id = Column(Integer, primary_key=True, autoincrement=True)
  71. donor = Column(String(255))
  72. title = Column(String(512))
  73. href = Column(String(512))
  74. parse_date = Column(Integer)
  75. html = Column(Text)
  76. text = Column(Text)
  77. lemmas = Column(Text) # набор лемм из текста (мешок слов)
  78. level = Column(Integer) # уровень вложенности ссылки от корня сайта
  79. status = Column(Integer) # 0 - не загружена, 1 - загружена, ключевика нет; 2 - ключевик есть, уведомление не отправлено; 3 - уведомление не отправлено
  80. def __init__(self, donor, title, href, parse_date, html, text, lemmas, level, status):
  81. self.donor = donor
  82. self.title = title
  83. self.href = href
  84. self.parse_date = parse_date
  85. self.html = html
  86. self.text = text
  87. self.lemmas = lemmas
  88. self.level = level
  89. self.status = status
  90. def __repr__(self):
  91. return "<Link('%s', '%s')>" % (self.title, self.href)
  92. class Log(Base):
  93. __tablename__ = 'log'
  94. id = Column(Integer, primary_key=True, autoincrement=True)
  95. action = Column(String(64))
  96. status = Column(String(64))
  97. time = Column(Integer)
  98. donor = Column(String(64))
  99. def __init__(self, action, status, time, donor):
  100. self.action = action
  101. self.status = status
  102. self.time = time
  103. self.donor = donor
  104. def __repr__(self):
  105. return "<Log('%s','%s', '%s')>" % (self.action, self.status)
# Create the tables (no-op for tables that already exist).
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Single session shared by the whole script.
sqllite_session = Session()
# https://www.kaggle.com/code/alxmamaev/how-to-easy-preprocess-russian-text/script
# Create the lemmatizer and stopword list (both used by preprocess_text below).
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
  114. #Preprocess function
  115. def preprocess_text(text):
  116. tokens = mystem.lemmatize(text.lower())
  117. tokens = [token for token in tokens if token not in russian_stopwords\
  118. and token != " " \
  119. and token.strip() not in punctuation]
  120. text = " ".join(tokens)
  121. return text
  122. # парсим по случайных 10 ссылок за раз
  123. for i in range(0, 10):
  124. link = sqllite_session.query(Links).filter(Links.status == 0).order_by(func.random()).first()
  125. print("Парсим ", link.href)
  126. #формируем запрос
  127. user_agent = random.choice(user_agents)
  128. donor_parsed = urlparse(link.href)
  129. headers = {
  130. "Host": str(donor_parsed.hostname),
  131. 'User-Agent': str(user_agent),
  132. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  133. 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
  134. 'Accept-Encoding': 'gzip, deflate, br',
  135. 'Referer': str(link.href),
  136. 'Upgrade-Insecure-Requests': '1',
  137. 'Connection': 'keep-alive'}
  138. response = requests.get(link.href, headers=headers)
  139. print("status_code: ", response.status_code)
  140. if (response.status_code == 200):
  141. link.html = response.text
  142. soup = BeautifulSoup(response.text, "lxml")
  143. title = str(soup.find('title').string.strip())
  144. print("title: ", title)
  145. link.title = title
  146. text = soup.get_text()
  147. link.text = text
  148. lemmas = preprocess_text(text)
  149. print(lemmas)
  150. print()
  151. print()